From b413d702b272960dcc3970394194f5328c784eeb Mon Sep 17 00:00:00 2001
From: Valerio
Date: Thu, 23 Apr 2026 14:59:29 +0200
Subject: [PATCH 01/51] feat(fetch): add fetch_smart with Reddit + Akamai
rescue paths, bump 0.5.6
---
CHANGELOG.md | 10 +++++
Cargo.lock | 14 +++----
Cargo.toml | 2 +-
crates/webclaw-fetch/src/client.rs | 59 ++++++++++++++++++++++++++----
4 files changed, 69 insertions(+), 16 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 94b9ddb..54cb31f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,16 @@
All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/).
+## [0.5.6] — 2026-04-23
+
+### Added
+- `FetchClient::fetch_smart(url)` applies per-site rescue logic and returns the same `FetchResult` shape as `fetch()`. Reddit URLs route to the `.json` API, and Akamai-style challenge pages trigger a homepage cookie warmup plus a retry. Makes `/v1/scrape` on Reddit populate markdown again.
+
+### Fixed
+- Regression introduced in 0.5.4 where the production server's `/v1/scrape` bypassed the Reddit `.json` shortcut and Akamai cookie warmup that `fetch_and_extract` had been providing. Both helpers now live in `fetch_smart` and every caller path picks them up.
+
+---
+
## [0.5.5] — 2026-04-23
### Added
diff --git a/Cargo.lock b/Cargo.lock
index 30135cd..b382000 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3219,7 +3219,7 @@ dependencies = [
[[package]]
name = "webclaw-cli"
-version = "0.5.5"
+version = "0.5.6"
dependencies = [
"clap",
"dotenvy",
@@ -3240,7 +3240,7 @@ dependencies = [
[[package]]
name = "webclaw-core"
-version = "0.5.5"
+version = "0.5.6"
dependencies = [
"ego-tree",
"once_cell",
@@ -3258,7 +3258,7 @@ dependencies = [
[[package]]
name = "webclaw-fetch"
-version = "0.5.5"
+version = "0.5.6"
dependencies = [
"async-trait",
"bytes",
@@ -3284,7 +3284,7 @@ dependencies = [
[[package]]
name = "webclaw-llm"
-version = "0.5.5"
+version = "0.5.6"
dependencies = [
"async-trait",
"reqwest",
@@ -3297,7 +3297,7 @@ dependencies = [
[[package]]
name = "webclaw-mcp"
-version = "0.5.5"
+version = "0.5.6"
dependencies = [
"dirs",
"dotenvy",
@@ -3317,7 +3317,7 @@ dependencies = [
[[package]]
name = "webclaw-pdf"
-version = "0.5.5"
+version = "0.5.6"
dependencies = [
"pdf-extract",
"thiserror",
@@ -3326,7 +3326,7 @@ dependencies = [
[[package]]
name = "webclaw-server"
-version = "0.5.5"
+version = "0.5.6"
dependencies = [
"anyhow",
"axum",
diff --git a/Cargo.toml b/Cargo.toml
index abd5816..d9cfd92 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"]
[workspace.package]
-version = "0.5.5"
+version = "0.5.6"
edition = "2024"
license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw"
diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index e147337..d61694f 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -261,10 +261,52 @@ impl FetchClient {
self.cloud.as_deref()
}
+ /// Fetch a URL with per-site rescue paths: Reddit URLs redirect to the
+ /// `.json` API, and Akamai-style challenge responses trigger a homepage
+ /// cookie warmup and a retry. Returns the same `FetchResult` shape as
+ /// [`Self::fetch`] so every caller (CLI, MCP, OSS server, production
+ /// server) benefits without shape churn.
+ ///
+ /// This is the method most callers want. Use plain [`Self::fetch`] only
+ /// when you need literal no-rescue behavior (e.g. inside the rescue
+ /// logic itself to avoid recursion).
+ pub async fn fetch_smart(&self, url: &str) -> Result<FetchResult, FetchError> {
+ // Reddit: the HTML page shows a verification interstitial for most
+ // client IPs, but appending `.json` returns the post + comment tree
+ // publicly. `parse_reddit_json` in downstream code knows how to read
+ // the result; here we just do the URL swap at the fetch layer.
+ if crate::reddit::is_reddit_url(url) {
+ let json_url = crate::reddit::json_url(url);
+ if let Ok(resp) = self.fetch(&json_url).await {
+ if resp.status == 200 && !resp.html.is_empty() {
+ return Ok(resp);
+ }
+ }
+ // If the .json fetch failed, fall through to the HTML path.
+ }
+
+ let resp = self.fetch(url).await?;
+
+ // Akamai / bazadebezolkohpepadr challenge: visit the homepage to
+ // collect warmup cookies (_abck, bm_sz, etc.), then retry.
+ if is_challenge_html(&resp.html)
+ && let Some(homepage) = extract_homepage(url)
+ {
+ debug!("challenge detected, warming cookies via {homepage}");
+ let _ = self.fetch(&homepage).await;
+ if let Ok(retry) = self.fetch(url).await {
+ return Ok(retry);
+ }
+ }
+
+ Ok(resp)
+ }
+
/// Fetch a URL and return the raw HTML + response metadata.
///
/// Automatically retries on transient failures (network errors, 5xx, 429)
- /// with exponential backoff: 0s, 1s (2 attempts total).
+ /// with exponential backoff: 0s, 1s (2 attempts total). No per-site
+ /// rescue logic; use [`Self::fetch_smart`] for that.
#[instrument(skip(self), fields(url = %url))]
pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
let delays = [Duration::ZERO, Duration::from_secs(1)];
@@ -713,22 +755,23 @@ fn is_pdf_content_type(headers: &http::HeaderMap) -> bool {
/// Detect if a response looks like a bot protection challenge page.
fn is_challenge_response(response: &Response) -> bool {
- let len = response.body().len();
+ is_challenge_html(response.text().as_ref())
+}
+
+/// Same as `is_challenge_response`, operating on a body string directly
+/// so callers holding a `FetchResult` can reuse the heuristic.
+fn is_challenge_html(html: &str) -> bool {
+ let len = html.len();
if len > 15_000 || len == 0 {
return false;
}
-
- let text = response.text();
- let lower = text.to_lowercase();
-
+ let lower = html.to_lowercase();
if lower.contains("challenge page") {
return true;
}
-
if lower.contains("bazadebezolkohpepadr") && len < 5_000 {
return true;
}
-
false
}
From 866fa88aa05d208cb5389795cfc655876742cfbc Mon Sep 17 00:00:00 2001
From: Valerio
Date: Thu, 23 Apr 2026 15:06:35 +0200
Subject: [PATCH 02/51] fix(fetch): reject HTML verification pages served at
.json reddit URL
---
crates/webclaw-fetch/src/client.rs | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index d61694f..78731e5 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -277,12 +277,18 @@ impl FetchClient {
// the result; here we just do the URL swap at the fetch layer.
if crate::reddit::is_reddit_url(url) {
let json_url = crate::reddit::json_url(url);
- if let Ok(resp) = self.fetch(&json_url).await {
- if resp.status == 200 && !resp.html.is_empty() {
+ if let Ok(resp) = self.fetch(&json_url).await
+ && resp.status == 200
+ {
+ // Reddit will serve an HTML verification page at the .json
+ // URL too when the IP is flagged. Only return if the body
+ // actually starts with a JSON payload.
+ let first = resp.html.trim_start().as_bytes().first().copied();
+ if matches!(first, Some(b'{') | Some(b'[')) {
return Ok(resp);
}
}
- // If the .json fetch failed, fall through to the HTML path.
+ // If the .json fetch failed or returned HTML, fall through.
}
let resp = self.fetch(url).await?;
From 966981bc4299323721c2d43ff5aa157bf939b82c Mon Sep 17 00:00:00 2001
From: Valerio
Date: Thu, 23 Apr 2026 15:17:04 +0200
Subject: [PATCH 03/51] fix(fetch): send bot-identifying UA on reddit .json API
to bypass browser UA block
---
crates/webclaw-fetch/src/client.rs | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index 78731e5..94d698f 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -275,14 +275,21 @@ impl FetchClient {
// client IPs, but appending `.json` returns the post + comment tree
// publicly. `parse_reddit_json` in downstream code knows how to read
// the result; here we just do the URL swap at the fetch layer.
- if crate::reddit::is_reddit_url(url) {
+ if crate::reddit::is_reddit_url(url) && !url.ends_with(".json") {
let json_url = crate::reddit::json_url(url);
- if let Ok(resp) = self.fetch(&json_url).await
+ // Reddit's public .json API serves JSON to identifiable bot
+ // User-Agents and blocks browser UAs with a verification wall.
+ // Override our Chrome-profile UA for this specific call.
+ let ua = concat!(
+ "Webclaw/",
+ env!("CARGO_PKG_VERSION"),
+ " (+https://webclaw.io)"
+ );
+ if let Ok(resp) = self
+ .fetch_with_headers(&json_url, &[("user-agent", ua)])
+ .await
&& resp.status == 200
{
- // Reddit will serve an HTML verification page at the .json
- // URL too when the IP is flagged. Only return if the body
- // actually starts with a JSON payload.
let first = resp.html.trim_start().as_bytes().first().copied();
if matches!(first, Some(b'{') | Some(b'[')) {
return Ok(resp);
From a5c3433372f33517f2aa765c2544ab6abdfe1cc7 Mon Sep 17 00:00:00 2001
From: Valerio
Date: Thu, 23 Apr 2026 15:26:31 +0200
Subject: [PATCH 04/51] fix(core+server): guard markdown pipe slice + detect
trustpilot/reddit verify walls
---
CHANGELOG.md | 3 ++-
crates/webclaw-core/src/markdown.rs | 6 ++++--
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54cb31f..3000593 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,10 +6,11 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
## [0.5.6] — 2026-04-23
### Added
-- `FetchClient::fetch_smart(url)` applies per-site rescue logic and returns the same `FetchResult` shape as `fetch()`. Reddit URLs route to the `.json` API, and Akamai-style challenge pages trigger a homepage cookie warmup plus a retry. Makes `/v1/scrape` on Reddit populate markdown again.
+- `FetchClient::fetch_smart(url)` applies per-site rescue logic and returns the same `FetchResult` shape as `fetch()`. Reddit URLs route to the `.json` API with an identifiable bot `User-Agent`, and Akamai-style challenge pages trigger a homepage cookie warmup plus a retry. Makes `/v1/scrape` on Reddit populate markdown again.
### Fixed
- Regression introduced in 0.5.4 where the production server's `/v1/scrape` bypassed the Reddit `.json` shortcut and Akamai cookie warmup that `fetch_and_extract` had been providing. Both helpers now live in `fetch_smart` and every caller path picks them up.
+- Panic in the markdown converter (`markdown.rs:925`) on single-pipe `|` lines. A `[1..len-1]` slice on a 1-char input triggered `begin <= end`. Guarded.
---
diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs
index 1a61586..d0a2c23 100644
--- a/crates/webclaw-core/src/markdown.rs
+++ b/crates/webclaw-core/src/markdown.rs
@@ -920,8 +920,10 @@ fn strip_markdown(md: &str) -> String {
continue;
}
- // Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs
- if trimmed.starts_with('|') && trimmed.ends_with('|') {
+ // Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs.
+ // Require at least 2 chars so the slice `[1..len-1]` stays a valid range on single-pipe rows
+ // (which aren't real tables anyway); a lone `|` made it `[1..0]` and panicked on `begin <= end`.
+ if trimmed.len() >= 2 && trimmed.starts_with('|') && trimmed.ends_with('|') {
let inner = &trimmed[1..trimmed.len() - 1];
let cells: Vec<&str> = inner.split('|').map(|c| c.trim()).collect();
lines.push(cells.join("\t"));
From 4908367720e78881700b1bee52cffca71eba8724 Mon Sep 17 00:00:00 2001
From: Valerio
Date: Sun, 26 Apr 2026 17:15:44 +0200
Subject: [PATCH 05/51] docs(readme): add hosted API callout above Get Started
Surface webclaw.io as a clear alternative path for visitors who want
the antibot, JS rendering, async jobs, search, and watches the OSS
server doesn't ship. Sits between the value-prop and the install
instructions so self-host stays the primary on-ramp.
---
README.md | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/README.md b/README.md
index b752d46..d8ca2e5 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,14 @@ It extracts clean, structured content from any URL using Chrome-level TLS finger
---
+## Two ways to use webclaw
+
+**Self-host.** Free, AGPL-3.0, runs locally. Get the CLI, MCP server, or REST API in one command. Ships with the 8 core extraction tools: scrape, crawl, map, batch, extract, summarize, diff, brand.
+
+**Hosted API** at **[webclaw.io](https://webclaw.io)**. 500 pages/month free, no card. Adds what self-hosting can't do alone: antibot bypass (Cloudflare, DataDome, WAF), JS rendering, async crawl/research jobs, web search, watches. For when you want it to *just work*.
+
+---
+
## Get Started (30 seconds)
### For AI agents (Claude, Cursor, Windsurf, VS Code)
From 5795c5c4226577a11a2781487cb01f63c91ef420 Mon Sep 17 00:00:00 2001
From: Valerio
Date: Sun, 26 Apr 2026 17:55:22 +0200
Subject: [PATCH 06/51] docs(readme): add star history chart
---
README.md | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/README.md b/README.md
index d8ca2e5..fd634bf 100644
--- a/README.md
+++ b/README.md
@@ -403,6 +403,16 @@ We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
TLS and HTTP/2 browser fingerprinting is powered by [wreq](https://github.com/0x676e67/wreq) and [http2](https://github.com/0x676e67/http2) by [@0x676e67](https://github.com/0x676e67), who pioneered browser-grade HTTP/2 fingerprinting in Rust.
+## Star History
+
+
+
+
+
+
+
+
+
## License
[AGPL-3.0](LICENSE)
From 0e6c7cdc97f5ecfd3e2caf3d86d17dab73f20f60 Mon Sep 17 00:00:00 2001
From: Valerio <88933932+0xMassi@users.noreply.github.com>
Date: Mon, 27 Apr 2026 13:18:22 +0200
Subject: [PATCH 07/51] Add GitHub Sponsors username to FUNDING.yml
Updated funding model with GitHub Sponsors username.
---
.github/FUNDING.yml | 13 +++++++++++++
1 file changed, 13 insertions(+)
create mode 100644 .github/FUNDING.yml
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 0000000..650984e
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,13 @@
+github: [0xMassi]
+patreon:
+open_collective:
+ko_fi:
+tidelift:
+community_bridge:
+liberapay:
+issuehunt:
+lfx_crowdfunding:
+polar:
+buy_me_a_coffee:
+thanks_dev:
+custom:
From 923445f4a85f2494c0d502222a91aef7a9ce07db Mon Sep 17 00:00:00 2001
From: Valerio
Date: Thu, 30 Apr 2026 11:46:45 +0200
Subject: [PATCH 08/51] docs(readme): add h1 brand heading
The repo had no heading-level brand anchor, only a banner image and
an h3 slogan. Search engines indexing the README were missing the
canonical brand signal. The new h1 is what GitHub renders as the
title of the page and what Google co-ranks with webclaw.io.
Bumps workspace version to 0.5.7.
---
CHANGELOG.md | 7 +++++++
Cargo.toml | 2 +-
README.md | 23 ++++++++++++++++++++---
3 files changed, 28 insertions(+), 4 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3000593..a0cc9ca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,13 @@
All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/).
+## [0.5.7] — 2026-04-30
+
+### Docs
+- README header now uses an `<h1>webclaw</h1>` instead of an `<h3>` slogan. The repo had no heading-level brand anchor before, only a banner image, so search engines indexing the README were missing the canonical brand signal. The new heading is what GitHub renders as the title of the page and what Google co-ranks with webclaw.io.
+
+---
+
## [0.5.6] — 2026-04-23
### Added
diff --git a/Cargo.toml b/Cargo.toml
index d9cfd92..9b55475 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"]
[workspace.package]
-version = "0.5.6"
+version = "0.5.7"
edition = "2024"
license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw"
diff --git a/README.md b/README.md
index fd634bf..623a4d3 100644
--- a/README.md
+++ b/README.md
@@ -4,10 +4,12 @@
-
- The fastest web scraper for AI agents.
+
webclaw
+
+
+ The fastest web scraper for AI agents. 67% fewer tokens. Sub-millisecond extraction. Zero browser overhead.
-
+
@@ -90,6 +92,21 @@ cargo install --git https://github.com/0xMassi/webclaw.git webclaw-cli
cargo install --git https://github.com/0xMassi/webclaw.git webclaw-mcp
```
+webclaw uses BoringSSL (via `boring-sys2`) for TLS fingerprinting, which
+needs a few system packages at build time. If `cargo install` panics with
+a `boring-sys2` build error or `Unable to find libclang`, install the
+prerequisites first:
+
+| OS | Install command |
+|---|---|
+| Debian / Ubuntu | `sudo apt install -y pkg-config libssl-dev cmake clang git build-essential` |
+| Fedora / RHEL | `sudo dnf install -y pkg-config openssl-devel cmake clang git make gcc` |
+| Arch | `sudo pacman -S pkg-config openssl cmake clang git base-devel` |
+| macOS | `xcode-select --install` (the Xcode Command Line Tools cover everything) |
+
+If you do not want to manage build dependencies yourself, prefer the
+**Homebrew**, **Docker**, or **Prebuilt binaries** options above.
+
### Docker
```bash
From 23544f8facee15958ea1f60f2befef79f22c001e Mon Sep 17 00:00:00 2001
From: Valerio
Date: Sun, 3 May 2026 21:17:23 +0200
Subject: [PATCH 09/51] docs(claude): note youtube.rs role and yt-dlp
short-circuit in server
The webclaw-core youtube module produces structured markdown but no
transcript; document that and point at the production server's
youtube_transcript.rs short-circuit for the full YoutubeData + caption
text shape.
---
CLAUDE.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CLAUDE.md b/CLAUDE.md
index c33d61f..b30bd84 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -38,6 +38,7 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R
- `filter.rs` — CSS selector include/exclude filtering (ExtractionOptions)
- `diff.rs` — Content change tracking engine (snapshot diffing)
- `brand.rs` — Brand identity extraction from DOM structure and CSS
+- `youtube.rs` — `ytInitialPlayerResponse` parser, structured markdown for `youtube.com/watch` URLs (title, channel, views, published, duration, description). Produces the legacy markdown shape — for transcripts and a structured `YoutubeData` block see the production server's `youtube_transcript.rs` short-circuit (yt-dlp via proxy pool).
### Fetch Modules (`webclaw-fetch`)
- `client.rs` — FetchClient with wreq BoringSSL TLS impersonation; implements the public `Fetcher` trait so callers (including server adapters) can swap in alternative implementations
From bdf81fe6bfc235e6e9f3acab5c2f2a8beed024de Mon Sep 17 00:00:00 2001
From: Valerio
Date: Mon, 4 May 2026 11:50:57 +0200
Subject: [PATCH 10/51] fix: harden fetch URL validation
---
Dockerfile | 8 +-
crates/webclaw-fetch/src/client.rs | 21 ++-
crates/webclaw-fetch/src/lib.rs | 1 +
crates/webclaw-fetch/src/tls.rs | 30 +++-
crates/webclaw-fetch/src/url_security.rs | 196 +++++++++++++++++++++
crates/webclaw-mcp/src/server.rs | 17 +-
crates/webclaw-server/src/error.rs | 7 +-
crates/webclaw-server/src/main.rs | 16 ++
crates/webclaw-server/src/routes/batch.rs | 10 +-
crates/webclaw-server/src/routes/scrape.rs | 5 +-
10 files changed, 284 insertions(+), 27 deletions(-)
create mode 100644 crates/webclaw-fetch/src/url_security.rs
diff --git a/Dockerfile b/Dockerfile
index 6f84e06..552aea7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -73,11 +73,9 @@ COPY --from=builder /build/target/release/webclaw-server /usr/local/bin/webclaw-
# as documentation; callers still need `-p 3000:3000` on `docker run`.
EXPOSE 3000
-# Container default: bind all interfaces so `-p 3000:3000` works. The binary
-# itself defaults to 127.0.0.1 (safe for `cargo run` on a laptop); inside
-# Docker that would make the server unreachable, so we flip it here.
-# Override with -e WEBCLAW_HOST=127.0.0.1 if you front this with another
-# process in the same container.
+# Container default: bind all interfaces so `-p 3000:3000` works. Public
+# binding requires WEBCLAW_API_KEY; the binary refuses open-auth 0.0.0.0
+# unless WEBCLAW_ALLOW_OPEN_PUBLIC=1 is set explicitly for local testing.
ENV WEBCLAW_HOST=0.0.0.0
# Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other
diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index 94d698f..4fff454 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -199,6 +199,8 @@ impl FetchClient {
config.timeout,
&config.headers,
config.proxy.as_deref(),
+ config.follow_redirects,
+ config.max_redirects,
)
})
.collect::<Result<Vec<_>, _>>()?;
@@ -218,7 +220,14 @@ impl FetchClient {
.iter()
.map(|proxy| {
let v = *variants.choose(&mut rng).unwrap();
- crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy))
+ crate::tls::build_client(
+ v,
+ config.timeout,
+ &config.headers,
+ Some(proxy),
+ config.follow_redirects,
+ config.max_redirects,
+ )
})
.collect::<Result<Vec<_>, _>>()?;
@@ -379,6 +388,8 @@ impl FetchClient {
url: &str,
extra: &[(&str, &str)],
) -> Result<FetchResult, FetchError> {
+ let parsed_url = crate::url_security::validate_public_http_url(url).await?;
+ let url = parsed_url.as_str();
let start = Instant::now();
let client = self.pick_client(url);
@@ -463,13 +474,17 @@ impl FetchClient {
url: &str,
options: &webclaw_core::ExtractionOptions,
) -> Result {
+ let parsed_url = crate::url_security::validate_public_http_url(url).await?;
+ let url = parsed_url.as_str();
+
// Reddit fallback: use their JSON API to get post + full comment tree.
if crate::reddit::is_reddit_url(url) {
let json_url = crate::reddit::json_url(url);
+ let json_url = crate::url_security::validate_public_http_url(&json_url).await?;
debug!("reddit detected, fetching {json_url}");
let client = self.pick_client(url);
- let resp = client.get(&json_url).send().await?;
+ let resp = client.get(json_url.as_str()).send().await?;
let response = Response::from_wreq(resp).await?;
if response.is_success() {
let bytes = response.body();
@@ -491,7 +506,7 @@ impl FetchClient {
&& let Some(homepage) = extract_homepage(url)
{
debug!("challenge detected, warming cookies via {homepage}");
- let _ = client.get(&homepage).send().await;
+ let _ = self.fetch(&homepage).await;
let resp = client.get(url).send().await?;
response = Response::from_wreq(resp).await?;
debug!("retried after cookie warmup: status={}", response.status());
diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs
index ca04bdb..029a7b6 100644
--- a/crates/webclaw-fetch/src/lib.rs
+++ b/crates/webclaw-fetch/src/lib.rs
@@ -15,6 +15,7 @@ pub mod proxy;
pub mod reddit;
pub mod sitemap;
pub mod tls;
+pub mod url_security;
pub use browser::BrowserProfile;
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
diff --git a/crates/webclaw-fetch/src/tls.rs b/crates/webclaw-fetch/src/tls.rs
index 308265b..fdaeb0b 100644
--- a/crates/webclaw-fetch/src/tls.rs
+++ b/crates/webclaw-fetch/src/tls.rs
@@ -455,6 +455,8 @@ pub fn build_client(
timeout: Duration,
extra_headers: &std::collections::HashMap,
proxy: Option<&str>,
+ follow_redirects: bool,
+ max_redirects: u32,
) -> Result<Client, FetchError> {
// SafariIos26 builds its Emulation on top of wreq-util's base instead
// of from scratch. See `safari_ios_emulation` for why.
@@ -490,7 +492,10 @@ pub fn build_client(
let mut builder = Client::builder()
.emulation(emulation)
- .redirect(wreq::redirect::Policy::limited(10))
+ .redirect(ssrf_safe_redirect_policy(
+ follow_redirects,
+ max_redirects as usize,
+ ))
.cookie_store(true)
.timeout(timeout);
@@ -504,3 +509,26 @@ pub fn build_client(
.build()
.map_err(|e| FetchError::Build(e.to_string()))
}
+
+fn ssrf_safe_redirect_policy(
+ follow_redirects: bool,
+ max_redirects: usize,
+) -> wreq::redirect::Policy {
+ if !follow_redirects {
+ return wreq::redirect::Policy::none();
+ }
+
+ wreq::redirect::Policy::custom(move |attempt| {
+ if attempt.previous.len() > max_redirects {
+ return attempt.error("too many redirects");
+ }
+
+ attempt.pending(|attempt| async move {
+ let next_url = attempt.uri.to_string();
+ match crate::url_security::validate_public_http_url(&next_url).await {
+ Ok(_) => attempt.follow(),
+ Err(e) => attempt.error(e.to_string()),
+ }
+ })
+ })
+}
diff --git a/crates/webclaw-fetch/src/url_security.rs b/crates/webclaw-fetch/src/url_security.rs
new file mode 100644
index 0000000..1d2b534
--- /dev/null
+++ b/crates/webclaw-fetch/src/url_security.rs
@@ -0,0 +1,196 @@
+//! SSRF guard for every server-side fetch.
+//!
+//! Callers may still do cheap parse validation at the edge, but this
+//! module is the fetch-layer authority because redirects and helper
+//! fetches also pass through it.
+
+use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
+
+use tokio::net::lookup_host;
+use url::{Host, Url};
+
+use crate::error::FetchError;
+
+/// Parse a caller-provided URL and require an HTTP(S) host.
+pub fn validate_http_url(raw: &str) -> Result<Url, FetchError> {
+ let trimmed = raw.trim();
+ if trimmed.is_empty() {
+ return Err(FetchError::InvalidUrl("URL must not be empty".into()));
+ }
+
+ let parsed =
+ Url::parse(trimmed).map_err(|e| FetchError::InvalidUrl(format!("invalid URL: {e}")))?;
+ match parsed.scheme() {
+ "http" | "https" => {}
+ scheme => {
+ return Err(FetchError::InvalidUrl(format!(
+ "scheme '{scheme}' is not allowed, use http:// or https://"
+ )));
+ }
+ }
+
+ if parsed.host().is_none() {
+ return Err(FetchError::InvalidUrl("URL must include a host".into()));
+ }
+
+ Ok(parsed)
+}
+
+/// Parse, resolve, and reject private/internal destinations.
+///
+/// A domain is rejected if any resolved address is private or reserved.
+/// That is intentionally conservative: mixed public/private DNS answers
+/// are unsafe for server-side fetching.
+pub async fn validate_public_http_url(raw: &str) -> Result<Url, FetchError> {
+ let parsed = validate_http_url(raw)?;
+ validate_url_host_is_public(&parsed).await?;
+ Ok(parsed)
+}
+
+async fn validate_url_host_is_public(url: &Url) -> Result<(), FetchError> {
+ match url.host() {
+ Some(Host::Ipv4(ip)) => reject_blocked_ip(IpAddr::V4(ip)),
+ Some(Host::Ipv6(ip)) => reject_blocked_ip(IpAddr::V6(ip)),
+ Some(Host::Domain(host)) => {
+ let port = url
+ .port_or_known_default()
+ .ok_or_else(|| FetchError::InvalidUrl("URL must include a known port".into()))?;
+ let addrs = lookup_host((host, port))
+ .await
+ .map_err(|e| FetchError::InvalidUrl(format!("failed to resolve host: {e}")))?;
+
+ let mut resolved = false;
+ for addr in addrs {
+ resolved = true;
+ reject_blocked_ip(addr.ip())?;
+ }
+ if !resolved {
+ return Err(FetchError::InvalidUrl(
+ "host did not resolve to any addresses".into(),
+ ));
+ }
+ Ok(())
+ }
+ None => Err(FetchError::InvalidUrl("URL must include a host".into())),
+ }
+}
+
+fn reject_blocked_ip(ip: IpAddr) -> Result<(), FetchError> {
+ if is_blocked_ip(ip) {
+ Err(FetchError::InvalidUrl(
+ "URL resolves to a blocked private or internal address".into(),
+ ))
+ } else {
+ Ok(())
+ }
+}
+
+/// Return true for IP ranges that should never be fetched server-side.
+pub fn is_blocked_ip(ip: IpAddr) -> bool {
+ match ip {
+ IpAddr::V4(ip) => is_blocked_ipv4(ip),
+ IpAddr::V6(ip) => is_blocked_ipv6(ip),
+ }
+}
+
+fn is_blocked_ipv4(ip: Ipv4Addr) -> bool {
+ let o = ip.octets();
+
+ ip.is_unspecified()
+ || ip.is_loopback()
+ || ip.is_private()
+ || ip.is_link_local()
+ || o[0] == 0
+ || o[0] >= 224
+ || (o[0] == 100 && (64..=127).contains(&o[1]))
+ || (o[0] == 192 && o[1] == 0 && o[2] == 0)
+ || (o[0] == 192 && o[1] == 0 && o[2] == 2)
+ || (o[0] == 198 && (18..=19).contains(&o[1]))
+ || (o[0] == 198 && o[1] == 51 && o[2] == 100)
+ || (o[0] == 203 && o[1] == 0 && o[2] == 113)
+}
+
+fn is_blocked_ipv6(ip: Ipv6Addr) -> bool {
+ let s = ip.segments();
+
+ ip.is_unspecified()
+ || ip.is_loopback()
+ || ip.is_multicast()
+ || (s[0] & 0xfe00) == 0xfc00
+ || (s[0] & 0xffc0) == 0xfe80
+ || (s[0] == 0x0064 && s[1] == 0xff9b && s[2] == 0 && s[3] == 0 && s[4] == 0 && s[5] == 0)
+ || (s[0] == 0x2001 && s[1] == 0x0db8)
+ || embedded_ipv4(ip).is_some_and(is_blocked_ipv4)
+}
+
+fn embedded_ipv4(ip: Ipv6Addr) -> Option<Ipv4Addr> {
+ let s = ip.segments();
+
+ if s[0] == 0 && s[1] == 0 && s[2] == 0 && s[3] == 0 && s[4] == 0 && s[5] == 0xffff {
+ return Some(Ipv4Addr::new(
+ (s[6] >> 8) as u8,
+ s[6] as u8,
+ (s[7] >> 8) as u8,
+ s[7] as u8,
+ ));
+ }
+
+ if s[0] == 0 && s[1] == 0 && s[2] == 0 && s[3] == 0 && s[4] == 0 && s[5] == 0 {
+ return Some(Ipv4Addr::new(
+ (s[6] >> 8) as u8,
+ s[6] as u8,
+ (s[7] >> 8) as u8,
+ s[7] as u8,
+ ));
+ }
+
+ None
+}
+
+#[cfg(test)]
+mod tests {
+ use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
+
+ use super::{is_blocked_ip, validate_public_http_url};
+
+ #[tokio::test]
+ async fn blocks_ipv4_internal_ranges() {
+ for ip in [
+ Ipv4Addr::new(0, 0, 0, 0),
+ Ipv4Addr::new(10, 0, 0, 1),
+ Ipv4Addr::new(100, 64, 0, 1),
+ Ipv4Addr::new(127, 0, 0, 1),
+ Ipv4Addr::new(169, 254, 169, 254),
+ Ipv4Addr::new(172, 16, 0, 1),
+ Ipv4Addr::new(192, 168, 0, 1),
+ Ipv4Addr::new(198, 18, 0, 1),
+ ] {
+ let url = format!("http://{ip}/");
+ assert!(validate_public_http_url(&url).await.is_err(), "{ip}");
+ }
+ }
+
+ #[tokio::test]
+ async fn blocks_ipv6_internal_ranges() {
+ for ip in [
+ Ipv6Addr::LOCALHOST,
+ Ipv6Addr::UNSPECIFIED,
+ "fc00::1".parse().unwrap(),
+ "fe80::1".parse().unwrap(),
+ "64:ff9b::7f00:1".parse().unwrap(),
+ "::ffff:127.0.0.1".parse().unwrap(),
+ ] {
+ assert!(is_blocked_ip(IpAddr::V6(ip)), "{ip}");
+ }
+ }
+
+ #[tokio::test]
+ async fn allows_public_ip_literals() {
+ assert!(
+ validate_public_http_url("https://93.184.216.34/")
+ .await
+ .is_ok()
+ );
+ assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false);
+ }
+}
diff --git a/crates/webclaw-mcp/src/server.rs b/crates/webclaw-mcp/src/server.rs
index 45e8647..d56032d 100644
--- a/crates/webclaw-mcp/src/server.rs
+++ b/crates/webclaw-mcp/src/server.rs
@@ -13,7 +13,6 @@ use rmcp::model::{Implementation, ServerCapabilities, ServerInfo};
use rmcp::{ServerHandler, tool, tool_handler, tool_router};
use serde_json::json;
use tracing::{error, info, warn};
-use url::Url;
use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult};
@@ -54,19 +53,9 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
/// Validate that a URL is non-empty and has an http or https scheme.
fn validate_url(url: &str) -> Result<(), String> {
- if url.is_empty() {
- return Err("Invalid URL: must not be empty".into());
- }
- match Url::parse(url) {
- Ok(parsed) if parsed.scheme() == "http" || parsed.scheme() == "https" => Ok(()),
- Ok(parsed) => Err(format!(
- "Invalid URL: scheme '{}' not allowed, must start with http:// or https://",
- parsed.scheme()
- )),
- Err(e) => Err(format!(
- "Invalid URL: {e}. Must start with http:// or https://"
- )),
- }
+ webclaw_fetch::url_security::validate_http_url(url)
+ .map(|_| ())
+ .map_err(|e| format!("Invalid URL: {e}"))
}
/// Timeout for local fetch calls (prevents hanging on tarpitting servers).
diff --git a/crates/webclaw-server/src/error.rs b/crates/webclaw-server/src/error.rs
index c49a1c9..7f1d36e 100644
--- a/crates/webclaw-server/src/error.rs
+++ b/crates/webclaw-server/src/error.rs
@@ -70,7 +70,12 @@ impl IntoResponse for ApiError {
impl From<webclaw_fetch::FetchError> for ApiError {
fn from(e: webclaw_fetch::FetchError) -> Self {
- Self::Fetch(e.to_string())
+ match e {
+ webclaw_fetch::FetchError::InvalidUrl(msg) => {
+ Self::BadRequest(format!("invalid url: {msg}"))
+ }
+ other => Self::Fetch(other.to_string()),
+ }
}
}
diff --git a/crates/webclaw-server/src/main.rs b/crates/webclaw-server/src/main.rs
index f4cfdcb..06f2451 100644
--- a/crates/webclaw-server/src/main.rs
+++ b/crates/webclaw-server/src/main.rs
@@ -75,6 +75,15 @@ async fn main() -> anyhow::Result<()> {
.compact()
.init();
+ if is_unspecified_addr(args.host)
+ && args.api_key.is_none()
+ && std::env::var_os("WEBCLAW_ALLOW_OPEN_PUBLIC").is_none()
+ {
+ anyhow::bail!(
+ "refusing to bind 0.0.0.0/[::] without WEBCLAW_API_KEY; set WEBCLAW_API_KEY or WEBCLAW_ALLOW_OPEN_PUBLIC=1 to override"
+ );
+ }
+
let state = AppState::new(args.api_key.clone())?;
let v1 = Router::new()
@@ -121,3 +130,10 @@ async fn main() -> anyhow::Result<()> {
axum::serve(listener, app).await?;
Ok(())
}
+
+/// Returns `true` when `addr` is the unspecified address, i.e. `0.0.0.0` for
+/// IPv4 or `::` for IPv6.
+fn is_unspecified_addr(addr: IpAddr) -> bool {
+    // `IpAddr::is_unspecified` already dispatches over the V4/V6 variants,
+    // so no manual match is needed.
+    addr.is_unspecified()
+}
diff --git a/crates/webclaw-server/src/routes/batch.rs b/crates/webclaw-server/src/routes/batch.rs
index 99533c9..18ac1f4 100644
--- a/crates/webclaw-server/src/routes/batch.rs
+++ b/crates/webclaw-server/src/routes/batch.rs
@@ -37,6 +37,14 @@ pub async fn batch(
req.urls.len()
)));
}
+ let mut safe_urls = Vec::with_capacity(req.urls.len());
+ for url in &req.urls {
+ safe_urls.push(
+ webclaw_fetch::url_security::validate_public_http_url(url)
+ .await?
+ .to_string(),
+ );
+ }
let concurrency = req.concurrency.unwrap_or(5).clamp(1, HARD_MAX_CONCURRENCY);
@@ -47,7 +55,7 @@ pub async fn batch(
include_raw_html: false,
};
- let url_refs: Vec<&str> = req.urls.iter().map(|s| s.as_str()).collect();
+ let url_refs: Vec<&str> = safe_urls.iter().map(|s| s.as_str()).collect();
let results = state
.fetch()
.fetch_and_extract_batch_with_options(&url_refs, concurrency, &options)
diff --git a/crates/webclaw-server/src/routes/scrape.rs b/crates/webclaw-server/src/routes/scrape.rs
index 1c5fc52..2f7e73f 100644
--- a/crates/webclaw-server/src/routes/scrape.rs
+++ b/crates/webclaw-server/src/routes/scrape.rs
@@ -52,6 +52,7 @@ pub async fn scrape(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
+ let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
let formats = req.formats.as_vec();
let options = ExtractionOptions {
@@ -63,11 +64,11 @@ pub async fn scrape(
let extraction = state
.fetch()
- .fetch_and_extract_with_options(&req.url, &options)
+ .fetch_and_extract_with_options(url.as_str(), &options)
.await?;
let mut body = json!({
- "url": extraction.metadata.url.clone().unwrap_or_else(|| req.url.clone()),
+ "url": extraction.metadata.url.clone().unwrap_or_else(|| url.to_string()),
"metadata": extraction.metadata,
});
let obj = body.as_object_mut().expect("json::object");
From eede2f695374cc84775e378de13418b6decb0752 Mon Sep 17 00:00:00 2001
From: Valerio
Date: Mon, 4 May 2026 12:08:11 +0200
Subject: [PATCH 11/51] docs: credit SSRF report
---
CHANGELOG.md | 3 +++
1 file changed, 3 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a0cc9ca..afec609 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,9 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
## [0.5.7] — 2026-04-30
+### Security
+- Hardened server-side URL fetching against SSRF by rejecting private/internal IP ranges and unsafe redirect targets across CLI, MCP, and the self-hosted REST server. Thanks to KairoKid / dodge1218 (vonbrubeck@gmail.com) for the responsible report.
+
### Docs
- README header now uses an `
webclaw
` instead of an `
` slogan. The repo had no heading-level brand anchor before, only a banner image, so search engines indexing the README were missing the canonical brand signal. The new heading is what GitHub renders as the title of the page and what Google co-ranks with webclaw.io.
From 1c9def2fdeec7de26d50244431502c81731db7fa Mon Sep 17 00:00:00 2001
From: Valerio
Date: Mon, 4 May 2026 14:30:06 +0200
Subject: [PATCH 12/51] fix: validate self-host route URLs consistently
---
crates/webclaw-server/src/error.rs | 11 ++++++++++-
crates/webclaw-server/src/routes/brand.rs | 3 ++-
crates/webclaw-server/src/routes/crawl.rs | 5 +++--
crates/webclaw-server/src/routes/diff.rs | 3 ++-
crates/webclaw-server/src/routes/extract.rs | 3 ++-
crates/webclaw-server/src/routes/map.rs | 3 ++-
crates/webclaw-server/src/routes/structured.rs | 5 +++--
crates/webclaw-server/src/routes/summarize.rs | 3 ++-
8 files changed, 26 insertions(+), 10 deletions(-)
diff --git a/crates/webclaw-server/src/error.rs b/crates/webclaw-server/src/error.rs
index 7f1d36e..a63848f 100644
--- a/crates/webclaw-server/src/error.rs
+++ b/crates/webclaw-server/src/error.rs
@@ -74,7 +74,16 @@ impl From for ApiError {
webclaw_fetch::FetchError::InvalidUrl(msg) => {
Self::BadRequest(format!("invalid url: {msg}"))
}
- other => Self::Fetch(other.to_string()),
+ other => {
+ let msg = other.to_string();
+ if msg.contains("invalid url:")
+ || msg.contains("blocked private or internal address")
+ {
+ Self::BadRequest(msg)
+ } else {
+ Self::Fetch(msg)
+ }
+ }
}
}
}
diff --git a/crates/webclaw-server/src/routes/brand.rs b/crates/webclaw-server/src/routes/brand.rs
index 908976a..f3f6a43 100644
--- a/crates/webclaw-server/src/routes/brand.rs
+++ b/crates/webclaw-server/src/routes/brand.rs
@@ -21,8 +21,9 @@ pub async fn brand(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
+ let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
- let fetched = state.fetch().fetch(&req.url).await?;
+ let fetched = state.fetch().fetch(url.as_str()).await?;
let brand = extract_brand(&fetched.html, Some(&fetched.url));
Ok(Json(json!({
diff --git a/crates/webclaw-server/src/routes/crawl.rs b/crates/webclaw-server/src/routes/crawl.rs
index 4d15195..9ea484c 100644
--- a/crates/webclaw-server/src/routes/crawl.rs
+++ b/crates/webclaw-server/src/routes/crawl.rs
@@ -36,6 +36,7 @@ pub async fn crawl(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
+ let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
let max_pages = req.max_pages.unwrap_or(50).min(HARD_MAX_PAGES);
let max_depth = req.max_depth.unwrap_or(3);
let concurrency = req.concurrency.unwrap_or(5).min(20);
@@ -56,8 +57,8 @@ pub async fn crawl(
cancel_flag: None,
};
- let crawler = Crawler::new(&req.url, config).map_err(ApiError::from)?;
- let result = crawler.crawl(&req.url, None).await;
+ let crawler = Crawler::new(url.as_str(), config).map_err(ApiError::from)?;
+ let result = crawler.crawl(url.as_str(), None).await;
let pages: Vec = result
.pages
diff --git a/crates/webclaw-server/src/routes/diff.rs b/crates/webclaw-server/src/routes/diff.rs
index e4e038d..b0706fb 100644
--- a/crates/webclaw-server/src/routes/diff.rs
+++ b/crates/webclaw-server/src/routes/diff.rs
@@ -75,8 +75,9 @@ pub async fn diff_route(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
+ let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
- let current = state.fetch().fetch_and_extract(&req.url).await?;
+ let current = state.fetch().fetch_and_extract(url.as_str()).await?;
let previous = req.previous.into_extraction();
let result = diff(&previous, &current);
diff --git a/crates/webclaw-server/src/routes/extract.rs b/crates/webclaw-server/src/routes/extract.rs
index 05b8909..55b34a0 100644
--- a/crates/webclaw-server/src/routes/extract.rs
+++ b/crates/webclaw-server/src/routes/extract.rs
@@ -43,10 +43,11 @@ pub async fn extract(
"either `schema` or `prompt` is required",
));
}
+ let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
// Fetch + extract first so we feed the LLM clean markdown instead of
// raw HTML. Cheaper tokens, better signal.
- let extraction = state.fetch().fetch_and_extract(&req.url).await?;
+ let extraction = state.fetch().fetch_and_extract(url.as_str()).await?;
let content = if extraction.content.markdown.trim().is_empty() {
extraction.content.plain_text.clone()
} else {
diff --git a/crates/webclaw-server/src/routes/map.rs b/crates/webclaw-server/src/routes/map.rs
index 846183a..6daec69 100644
--- a/crates/webclaw-server/src/routes/map.rs
+++ b/crates/webclaw-server/src/routes/map.rs
@@ -27,8 +27,9 @@ pub async fn map(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
+ let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
- let entries = sitemap::discover(state.fetch(), &req.url).await?;
+ let entries = sitemap::discover(state.fetch(), url.as_str()).await?;
let body = if req.include_metadata {
json!({
diff --git a/crates/webclaw-server/src/routes/structured.rs b/crates/webclaw-server/src/routes/structured.rs
index c9cdc1a..9c10b67 100644
--- a/crates/webclaw-server/src/routes/structured.rs
+++ b/crates/webclaw-server/src/routes/structured.rs
@@ -25,7 +25,7 @@ impl From for ApiError {
match e {
ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),
- ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()),
+ ExtractorDispatchError::Fetch(f) => ApiError::from(f),
}
}
}
@@ -46,7 +46,8 @@ pub async fn scrape_vertical(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
- let data = extractors::dispatch_by_name(state.fetch(), &vertical, &req.url).await?;
+ let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
+ let data = extractors::dispatch_by_name(state.fetch(), &vertical, url.as_str()).await?;
Ok(Json(json!({
"vertical": vertical,
"url": req.url,
diff --git a/crates/webclaw-server/src/routes/summarize.rs b/crates/webclaw-server/src/routes/summarize.rs
index b967f1f..6b645ab 100644
--- a/crates/webclaw-server/src/routes/summarize.rs
+++ b/crates/webclaw-server/src/routes/summarize.rs
@@ -22,8 +22,9 @@ pub async fn summarize_route(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
+ let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
- let extraction = state.fetch().fetch_and_extract(&req.url).await?;
+ let extraction = state.fetch().fetch_and_extract(url.as_str()).await?;
let content = if extraction.content.markdown.trim().is_empty() {
extraction.content.plain_text.clone()
} else {
From 72b8dbc2852e0dbff0b961cbc9519877a7f364b4 Mon Sep 17 00:00:00 2001
From: Valerio
Date: Mon, 4 May 2026 21:25:07 +0200
Subject: [PATCH 13/51] fix: improve brand extraction signals
---
Cargo.lock | 14 +-
Cargo.toml | 3 +-
crates/webclaw-core/src/brand.rs | 264 ++++++++++++++++++++++++++-----
3 files changed, 234 insertions(+), 47 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index b382000..4a6b90e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3219,7 +3219,7 @@ dependencies = [
[[package]]
name = "webclaw-cli"
-version = "0.5.6"
+version = "0.5.8"
dependencies = [
"clap",
"dotenvy",
@@ -3240,7 +3240,7 @@ dependencies = [
[[package]]
name = "webclaw-core"
-version = "0.5.6"
+version = "0.5.8"
dependencies = [
"ego-tree",
"once_cell",
@@ -3258,7 +3258,7 @@ dependencies = [
[[package]]
name = "webclaw-fetch"
-version = "0.5.6"
+version = "0.5.8"
dependencies = [
"async-trait",
"bytes",
@@ -3284,7 +3284,7 @@ dependencies = [
[[package]]
name = "webclaw-llm"
-version = "0.5.6"
+version = "0.5.8"
dependencies = [
"async-trait",
"reqwest",
@@ -3297,7 +3297,7 @@ dependencies = [
[[package]]
name = "webclaw-mcp"
-version = "0.5.6"
+version = "0.5.8"
dependencies = [
"dirs",
"dotenvy",
@@ -3317,7 +3317,7 @@ dependencies = [
[[package]]
name = "webclaw-pdf"
-version = "0.5.6"
+version = "0.5.8"
dependencies = [
"pdf-extract",
"thiserror",
@@ -3326,7 +3326,7 @@ dependencies = [
[[package]]
name = "webclaw-server"
-version = "0.5.6"
+version = "0.5.8"
dependencies = [
"anyhow",
"axum",
diff --git a/Cargo.toml b/Cargo.toml
index 9b55475..f77595d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"]
[workspace.package]
-version = "0.5.7"
+version = "0.5.8"
edition = "2024"
license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw"
@@ -21,4 +21,3 @@ tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
clap = { version = "4", features = ["derive", "env"] }
dotenvy = "0.15"
-
diff --git a/crates/webclaw-core/src/brand.rs b/crates/webclaw-core/src/brand.rs
index 52eb1b7..8f6de53 100644
--- a/crates/webclaw-core/src/brand.rs
+++ b/crates/webclaw-core/src/brand.rs
@@ -79,9 +79,19 @@ static HSL_COLOR: Lazy = Lazy::new(|| {
.unwrap()
});
-/// Matches font-family values
-static FONT_FAMILY: Lazy =
- Lazy::new(|| Regex::new(r"(?i)font-family\s*:\s*([^;}{]+)").unwrap());
+/// Matches the family tail of CSS `font:` shorthand after size/line-height.
+///
+/// The pattern anchors on a font-size token (a size keyword such as `small`
+/// or `x-large`, or a number followed by a unit), skips an optional
+/// `/line-height` part, requires whitespace, and captures the remainder of
+/// the value as group 1. Matching is case-insensitive and the pattern is
+/// written in verbose (`x`) mode.
+static FONT_SHORTHAND_FAMILY: Lazy = Lazy::new(|| {
+ Regex::new(
+ r#"(?ix)
+ (?:^|\s)
+ (?:xx-small|x-small|small|medium|large|x-large|xx-large|larger|smaller|\d*\.?\d+(?:px|rem|em|pt|pc|in|cm|mm|%|vw|vh|vmin|vmax))
+ (?:\s*/\s*[^\s,]+)?
+ \s+
+ (.+)$
+ "#,
+ )
+ .unwrap()
+});
macro_rules! selector {
($s:expr) => {{
@@ -102,12 +112,12 @@ pub fn extract_brand(html: &str, url: Option<&str>) -> BrandIdentity {
let doc = Html::parse_document(html);
let base_url = url.and_then(|u| Url::parse(u).ok());
+ let name = extract_brand_name(&doc);
let css_sources = collect_css(&doc);
- let colors = extract_colors(&css_sources);
- let fonts = extract_fonts(&css_sources);
+ let colors = extract_colors(&css_sources, name.as_deref());
+ let fonts = extract_fonts(&css_sources, name.as_deref());
let logo_url = find_logo(&doc, base_url.as_ref());
let favicon_url = find_favicon(&doc, base_url.as_ref());
- let name = extract_brand_name(&doc);
let logos = find_all_logos(&doc, base_url.as_ref());
let og_image = find_og_image(&doc, base_url.as_ref());
@@ -390,7 +400,7 @@ fn is_boring_color(hex: &str) -> bool {
)
}
-fn extract_colors(decls: &[CssDecl]) -> Vec {
+fn extract_colors(decls: &[CssDecl], brand_name: Option<&str>) -> Vec {
// Track (hex, usage) -> count
let mut counts: HashMap> = HashMap::new();
@@ -429,6 +439,8 @@ fn extract_colors(decls: &[CssDecl]) -> Vec {
// Sort by frequency (descending)
colors.sort_by_key(|c| std::cmp::Reverse(c.count));
+ demote_or_remove_oauth_palette(&mut colors, brand_name);
+
// Promote top non-white/black to Primary/Secondary if they're still Unknown
let mut assigned_primary = colors.iter().any(|c| c.usage == ColorUsage::Primary);
let mut assigned_secondary = colors.iter().any(|c| c.usage == ColorUsage::Secondary);
@@ -450,6 +462,28 @@ fn extract_colors(decls: &[CssDecl]) -> Vec {
colors
}
+/// Hex codes that `demote_or_remove_oauth_palette` treats as belonging to an
+/// embedded Google sign-in widget rather than to the page's own brand.
+///
+/// Entries are compared against `hex` values by exact string match, so they
+/// are written as uppercase `#RRGGBB`.
+// NOTE(review): presumably extracted hex values are always normalised to
+// uppercase before this comparison — confirm against the color parser.
+const GOOGLE_OAUTH_COLORS: &[&str] = &[
+ "#1A73E8", "#4285F4", "#34A853", "#FBBC05", "#EA4335", "#5F6368", "#202124", "#E8EAED",
+ "#F1F3F4",
+];
+
+/// Strips the `GOOGLE_OAUTH_COLORS` palette from `colors` for non-Google brands.
+///
+/// Does nothing when `brand_name` contains "google" (ASCII case-insensitive),
+/// or when fewer than three entries of `colors` have a `hex` listed in
+/// `GOOGLE_OAUTH_COLORS`. Otherwise every entry whose `hex` is in that list is
+/// removed in place.
+///
+/// Despite the name, no color is demoted: matching entries are only removed.
+fn demote_or_remove_oauth_palette(colors: &mut Vec, brand_name: Option<&str>) {
+ let brand = brand_name.unwrap_or("").to_ascii_lowercase();
+ if brand.contains("google") {
+ return;
+ }
+
+ let google_hits = colors
+ .iter()
+ .filter(|c| GOOGLE_OAUTH_COLORS.contains(&c.hex.as_str()))
+ .count();
+ // Require several palette hits before removing anything, so a single
+ // coincidental match does not drop a color.
+ if google_hits < 3 {
+ return;
+ }
+
+ colors.retain(|c| !GOOGLE_OAUTH_COLORS.contains(&c.hex.as_str()));
+}
+
fn classify_color_property(property: &str) -> ColorUsage {
match property {
"background-color" | "background" => ColorUsage::Background,
@@ -584,31 +618,55 @@ const GENERIC_FONTS: &[&str] = &[
"initial",
"unset",
"revert",
+ "arial",
+ "times",
+ "times new roman",
+ "courier new",
+ "georgia",
+ "menlo",
+ "monaco",
+ "consolas",
+ "liberation mono",
+ "sf mono",
+ "sfmono-regular",
+ "source code pro",
+ "apple color emoji",
+ "segoe ui",
+ "segoe ui emoji",
+ "segoe ui symbol",
+ "noto color emoji",
+ "blinkmacsystemfont",
+ "-apple-system",
];
-fn extract_fonts(decls: &[CssDecl]) -> Vec {
+fn extract_fonts(decls: &[CssDecl], brand_name: Option<&str>) -> Vec {
let mut freq: HashMap = HashMap::new();
+ let brand = brand_name.unwrap_or("").to_ascii_lowercase();
for decl in decls {
if decl.property != "font-family" && decl.property != "font" {
continue;
}
- // For shorthand `font:`, try to extract font-family portion
+ // For shorthand `font:`, extract only the family tail after the
+ // size/line-height token. The previous implementation treated values
+ // like `500 12px Roboto` as a font family, which polluted `/v1/brand`
+ // output with CSS declarations instead of usable family names.
let family_str = if decl.property == "font" {
- // font shorthand: the font-family is the last part after the size.
- // Heuristic: take everything after a `/` or after `px`/`em`/`rem`/`%` + space
- FONT_FAMILY
- .captures(&format!("font-family: {}", &decl.value))
- .map(|c| c[1].to_string())
- .unwrap_or_else(|| decl.value.clone())
+ match parse_font_shorthand_family(&decl.value) {
+ Some(family) => family,
+ None => continue,
+ }
} else {
decl.value.clone()
};
for font in split_font_families(&family_str) {
let lower = font.to_lowercase();
- if !GENERIC_FONTS.contains(&lower.as_str()) && !is_junk_font_name(&lower) {
+ if !GENERIC_FONTS.contains(&lower.as_str())
+ && !is_junk_font_name(&lower)
+ && !is_third_party_auth_font(&lower, &brand)
+ {
*freq.entry(font).or_insert(0) += 1;
}
}
@@ -619,6 +677,32 @@ fn extract_fonts(decls: &[CssDecl]) -> Vec {
fonts.into_iter().map(|(name, _)| name).collect()
}
+/// Reports whether `name` is a "Google Sans" font on a page whose brand is
+/// not Google.
+///
+/// Both checks are case-sensitive substring searches for lowercase text, so
+/// callers are expected to pass lowercased `name` and `brand_name`.
+fn is_third_party_auth_font(name: &str, brand_name: &str) -> bool {
+    // A Google-branded page legitimately uses Google Sans.
+    if brand_name.contains("google") {
+        return false;
+    }
+    name.contains("google sans")
+}
+
+/// Extracts the font-family portion from the value of a CSS `font` shorthand.
+///
+/// Returns the trimmed text captured by group 1 of `FONT_SHORTHAND_FAMILY`,
+/// with any comma-separated family list left intact. Returns `None` when the
+/// pattern does not match `value` or when the resulting family text is empty.
+fn parse_font_shorthand_family(value: &str) -> Option {
+ let caps = FONT_SHORTHAND_FAMILY.captures(value)?;
+ let mut family = caps.get(1)?.as_str().trim().to_string();
+
+ // Drop the optional slash line-height residue if it was not consumed due
+ // to unusual whitespace, then leave comma-separated family names intact.
+ if let Some(stripped) = family.strip_prefix('/') {
+ // Discard everything up to the first space (the line-height token);
+ // if there is no space, nothing usable remains.
+ family = stripped
+ .split_once(' ')
+ .map(|(_, rest)| rest)
+ .unwrap_or("")
+ .trim()
+ .to_string();
+ }
+
+ if family.is_empty() {
+ None
+ } else {
+ Some(family)
+ }
+}
+
/// Filter out junk font names: CSS variables, hex hashes (Next.js font optimization),
/// single-character names, and other non-human-readable values.
fn is_junk_font_name(name: &str) -> bool {
@@ -630,10 +714,43 @@ fn is_junk_font_name(name: &str) -> bool {
if name.len() >= 8 && name.chars().all(|c| c.is_ascii_hexdigit()) {
return true;
}
+ if name
+ .split_whitespace()
+ .next()
+ .is_some_and(|part| part.len() >= 8 && part.chars().all(|c| c.is_ascii_hexdigit()))
+ {
+ return true;
+ }
// Too short to be a real font name
if name.len() < 3 {
return true;
}
+ // Third-party rendering libraries and icon fonts overwhelm app shells
+ // like claude.com/openai.com but are not product typography.
+ if name.contains("katex")
+ || name.contains("open dyslexic")
+ || name.contains("opendyslexic")
+ || name.contains("math")
+ || name.contains("fraktur")
+ || name.contains("caligraphic")
+ || name.contains("typewriter")
+ || name.contains("glyph")
+ || name.contains("icon")
+ || name.contains("emoji")
+ || name.contains("symbol")
+ {
+ return true;
+ }
+ // Malformed shorthand leftovers and CSS-internal values.
+ if name.contains(')')
+ || name.contains('!')
+ || name.contains('/')
+ || name.contains("px ")
+ || name.contains("rem ")
+ || name.contains("em ")
+ {
+ return true;
+ }
// Starts with underscore or double dash (CSS internals)
if name.starts_with('_') || name.starts_with("--") {
return true;
@@ -662,28 +779,11 @@ fn split_font_families(value: &str) -> Vec {
// ---------------------------------------------------------------------------
fn find_logo(doc: &Html, base_url: Option<&Url>) -> Option {
- // Strategy 1: <img> with class/id containing "logo"
- for el in doc.select(selector!("img")) {
- let class = el.value().attr("class").unwrap_or("");
- let id = el.value().attr("id").unwrap_or("");
- if (contains_ci(class, "logo") || contains_ci(id, "logo"))
- && let Some(src) = el.value().attr("src")
- {
- return Some(resolve_url(src, base_url));
- }
+ if let Some(url) = find_logo_in_scope(doc, base_url, "header img, nav img") {
+ return Some(url);
}
- // Strategy 2: <img> with alt containing "logo"
- for el in doc.select(selector!("img")) {
- let alt = el.value().attr("alt").unwrap_or("");
- if contains_ci(alt, "logo")
- && let Some(src) = el.value().attr("src")
- {
- return Some(resolve_url(src, base_url));
- }
- }
-
- // Strategy 3: <a> containing an <img> (homepage link with image)
+ // Strategy 2: <a> containing an <img> (homepage link with image)
for el in doc.select(selector!("a[href='/'] img, a[href] img")) {
// Check if parent links to homepage
if let Some(parent) = el.parent().and_then(|p| p.value().as_element()) {
@@ -699,6 +799,20 @@ fn find_logo(doc: &Html, base_url: Option<&Url>) -> Option {
None
}
+/// Returns the resolved `src` of the first image matched by `selector_str`
+/// whose `class`, `id`, or `alt` attribute contains "logo" (as decided by
+/// `contains_ci`).
+///
+/// Returns `None` when `selector_str` fails to parse or when no matched image
+/// qualifies. Images without a `src` attribute are skipped.
+fn find_logo_in_scope(doc: &Html, base_url: Option<&Url>, selector_str: &str) -> Option {
+    let selector = Selector::parse(selector_str).ok()?;
+    for el in doc.select(&selector) {
+        // Skip instead of bailing out with `?`: an image without `src` that
+        // precedes the real logo must not end the whole search.
+        let Some(src) = el.value().attr("src") else {
+            continue;
+        };
+        let class = el.value().attr("class").unwrap_or("");
+        let id = el.value().attr("id").unwrap_or("");
+        let alt = el.value().attr("alt").unwrap_or("");
+        if contains_ci(class, "logo") || contains_ci(id, "logo") || contains_ci(alt, "logo") {
+            return Some(resolve_url(src, base_url));
+        }
+    }
+    None
+}
+
// ---------------------------------------------------------------------------
// Favicon detection
// ---------------------------------------------------------------------------
@@ -829,8 +943,9 @@ fn find_all_logos(doc: &Html, base_url: Option<&Url>) -> Vec {
}
}
- // Logo images (class/id/alt containing "logo")
- for el in doc.select(selector!("img")) {
+ // Logo images in header/nav first. Product/customer logo grids elsewhere
+ // are common on SaaS sites and should not become the primary brand signal.
+ for el in doc.select(selector!("header img, nav img")) {
let class = el.value().attr("class").unwrap_or("");
let id = el.value().attr("id").unwrap_or("");
let alt = el.value().attr("alt").unwrap_or("");
@@ -997,6 +1112,25 @@ mod tests {
assert!(hexes.contains(&"#3498DB"), "brand color should survive");
}
+ #[test]
+ fn test_google_oauth_palette_does_not_overwhelm_non_google_brand() {
+ let html = r#"
+
+
+ "#;
+
+ let brand = extract_brand(html, None);
+ let hexes: Vec<&str> = brand.colors.iter().map(|c| c.hex.as_str()).collect();
+ assert!(!hexes.contains(&"#1A73E8"));
+ assert!(!hexes.contains(&"#4285F4"));
+ assert!(hexes.contains(&"#D97757"));
+ assert!(hexes.contains(&"#DC6038"));
+ }
+
#[test]
fn test_font_extraction() {
let html = r#""#;
+
+ let brand = extract_brand(html, None);
+ assert!(brand.fonts.contains(&"Roboto".to_string()));
+ assert!(brand.fonts.contains(&"OpenAI Sans".to_string()));
+ assert!(!brand.fonts.iter().any(|f| f.contains("12px")));
+ assert!(!brand.fonts.iter().any(|f| f.contains("KaTeX")));
+ assert!(!brand.fonts.iter().any(|f| f.contains("Emoji")));
+ assert!(!brand.fonts.iter().any(|f| f.contains("9d9927955a95a20d")));
+ }
+
#[test]
fn test_logo_by_class() {
let html = r#"
@@ -1086,6 +1238,42 @@ mod tests {
);
}
+ #[test]
+ fn test_body_logo_grid_does_not_become_primary_brand_logo() {
+ let html = r#"
+
+
+
+
+
+
+ "#;
+
+ let brand = extract_brand(html, Some("https://example.com"));
+ assert_eq!(brand.logo_url, None);
+ assert!(brand.logos.is_empty());
+ }
+
+ #[test]
+ fn test_header_logo_is_still_primary_logo() {
+ let html = r#"
+
+
+
+
+
+
+ "#;
+
+ let brand = extract_brand(html, Some("https://example.com"));
+ assert_eq!(
+ brand.logo_url.as_deref(),
+ Some("https://example.com/logo.svg")
+ );
+ assert_eq!(brand.logos.len(), 1);
+ assert_eq!(brand.logos[0].url, "https://example.com/logo.svg");
+ }
+
#[test]
fn test_favicon() {
let html = r#"
From 615f3266603a7915b1e898c74ad1eb3a895e6c2f Mon Sep 17 00:00:00 2001
From: Valerio
Date: Mon, 4 May 2026 21:52:49 +0200
Subject: [PATCH 14/51] docs: update changelog for brand extraction
---
CHANGELOG.md | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index afec609..01e4612 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,13 @@
All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/).
+## [0.5.8] — 2026-05-04
+
+### Fixed
+- Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise.
+
+---
+
## [0.5.7] — 2026-04-30
### Security
From a542e45768d54dc7f028485df7d18b6d8954b5e7 Mon Sep 17 00:00:00 2001
From: Justin Levine <20596508+jal-co@users.noreply.github.com>
Date: Tue, 5 May 2026 02:17:21 -0700
Subject: [PATCH 15/51] docs: refresh README badges
Replace README badges with shieldcn-styled badges.
---
README.md | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/README.md b/README.md
index 623a4d3..4362d35 100644
--- a/README.md
+++ b/README.md
@@ -12,16 +12,16 @@
-
-
-
-
+
+
+
+
-
-
-
-
+
+
+
+
---
From a1242a1c1d116c142c6a98ee18e27f50a90d201d Mon Sep 17 00:00:00 2001
From: Valerio
Date: Tue, 5 May 2026 11:18:58 +0200
Subject: [PATCH 16/51] docs: credit README badge refresh
---
CHANGELOG.md | 3 +++
1 file changed, 3 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 01e4612..53f636f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,9 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
### Fixed
- Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise.
+### Docs
+- Refreshed the README badges with a cleaner shieldcn style. Thanks to Justin Levine (`@jal-co`) for the contribution, and shout-out to his open-source [shieldcn](https://github.com/jal-co/shieldcn) project.
+
---
## [0.5.7] — 2026-04-30
From 513b0e493eaa7a7e47f5cb44880bb837be312477 Mon Sep 17 00:00:00 2001
From: SURYANSH MISHRA
Date: Tue, 5 May 2026 11:38:30 +0200
Subject: [PATCH 17/51] ci: add Windows release artifacts
Closes #34
---
.github/workflows/release.yml | 36 +++++++++++++++++++++++++++--------
CHANGELOG.md | 3 +++
2 files changed, 31 insertions(+), 8 deletions(-)
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 4c4c241..b2ea54a 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -27,6 +27,8 @@ jobs:
os: ubuntu-latest
- target: aarch64-unknown-linux-gnu
os: ubuntu-latest
+ - target: x86_64-pc-windows-msvc
+ os: windows-latest
steps:
- uses: actions/checkout@v4
@@ -57,6 +59,12 @@ jobs:
if: matrix.target != 'aarch64-unknown-linux-gnu' && runner.os == 'Linux'
run: sudo apt-get update && sudo apt-get install -y cmake
+ - name: Install NASM (Windows)
+ if: runner.os == 'Windows'
+ run: |
+ choco install nasm -y
+ echo "C:\Program Files\NASM" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+
- name: Build
run: cargo build --release --target ${{ matrix.target }}
@@ -71,12 +79,22 @@ jobs:
# don't repeat that mistake. If a future binary gets renamed or
# removed, this step should scream, not quietly publish an
# incomplete release.
- cp target/${{ matrix.target }}/release/webclaw "$staging/"
- cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/"
- cp target/${{ matrix.target }}/release/webclaw-server "$staging/"
- cp README.md LICENSE "$staging/"
- tar czf "$staging.tar.gz" "$staging"
- echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV
+
+ if [[ "${{ matrix.os }}" == "windows-latest" ]]; then
+ cp target/${{ matrix.target }}/release/webclaw.exe "$staging/"
+ cp target/${{ matrix.target }}/release/webclaw-mcp.exe "$staging/"
+ cp target/${{ matrix.target }}/release/webclaw-server.exe "$staging/"
+ cp README.md LICENSE "$staging/"
+ 7z a -tzip "$staging.zip" "$staging"
+ echo "ASSET=$staging.zip" >> $GITHUB_ENV
+ else
+ cp target/${{ matrix.target }}/release/webclaw "$staging/"
+ cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/"
+ cp target/${{ matrix.target }}/release/webclaw-server "$staging/"
+ cp README.md LICENSE "$staging/"
+ tar czf "$staging.tar.gz" "$staging"
+ echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV
+ fi
- name: Upload artifact
uses: actions/upload-artifact@v4
@@ -99,7 +117,8 @@ jobs:
run: |
cd artifacts
find . -name '*.tar.gz' -exec mv {} . \;
- sha256sum *.tar.gz > SHA256SUMS
+ find . -name '*.zip' -exec mv {} . \;
+ sha256sum *.tar.gz *.zip > SHA256SUMS 2>/dev/null || sha256sum * > SHA256SUMS
cat SHA256SUMS
- name: Create GitHub Release
@@ -108,6 +127,7 @@ jobs:
generate_release_notes: true
files: |
artifacts/*.tar.gz
+ artifacts/*.zip
artifacts/SHA256SUMS
docker:
@@ -181,7 +201,7 @@ jobs:
tag="${GITHUB_REF#refs/tags/}"
base="https://github.com/0xMassi/webclaw/releases/download/${tag}"
- # Download all 4 tarballs and compute SHAs
+ # Download all tarballs (Linux + macOS) and compute SHAs
for target in aarch64-apple-darwin x86_64-apple-darwin aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu; do
curl -sSL "${base}/webclaw-${tag}-${target}.tar.gz" -o "${target}.tar.gz"
done
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 53f636f..4e2a0ee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,9 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
## [0.5.8] — 2026-05-04
+### Added
+- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`.
+
### Fixed
- Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise.
From 86183b11e4e4e8e695836a6b2b042f3df0994985 Mon Sep 17 00:00:00 2001
From: Valerio
Date: Tue, 5 May 2026 11:44:07 +0200
Subject: [PATCH 18/51] docs: credit Windows release contribution
---
CHANGELOG.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4e2a0ee..63d163f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,7 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
## [0.5.8] — 2026-05-04
### Added
-- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`.
+- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. Thanks to Suryansh Mishra (`@notrealsuryansh`) for the contribution.
### Fixed
- Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise.
From a3aa4bce6f7a9a4d1b4d3e8bdb78edea75042a73 Mon Sep 17 00:00:00 2001
From: Valerio
Date: Wed, 6 May 2026 11:36:53 +0200
Subject: [PATCH 19/51] fix: support LLM provider compatibility options
Closes #36
---
CHANGELOG.md | 1 +
README.md | 3 +
crates/webclaw-cli/src/main.rs | 5 +-
crates/webclaw-llm/src/chain.rs | 2 +-
crates/webclaw-llm/src/providers/anthropic.rs | 61 +++++++-
crates/webclaw-llm/src/providers/openai.rs | 137 ++++++++++++++++--
6 files changed, 193 insertions(+), 16 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 63d163f..8e30acd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
### Added
- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. Thanks to Suryansh Mishra (`@notrealsuryansh`) for the contribution.
+- LLM providers now support `ANTHROPIC_BASE_URL` for Anthropic-compatible proxies, plus an `OPENAI_RESPONSE_FORMAT_TYPE` override for OpenAI-compatible backends such as LM Studio. Thanks to Toti (`@Toti330`) for the report.
### Fixed
- Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise.
diff --git a/README.md b/README.md
index 4362d35..79758f0 100644
--- a/README.md
+++ b/README.md
@@ -358,7 +358,10 @@ webclaw/
| `WEBCLAW_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) |
| `OLLAMA_HOST` | Ollama URL for local LLM features (default: `http://localhost:11434`) |
| `OPENAI_API_KEY` | OpenAI API key for LLM features |
+| `OPENAI_BASE_URL` | OpenAI-compatible base URL (default: `https://api.openai.com/v1`) |
+| `OPENAI_RESPONSE_FORMAT_TYPE` | JSON-mode response format for OpenAI-compatible backends: `json_object` (default), `json_schema`, or `text`. Use `text` or `json_schema` for LM Studio. |
| `ANTHROPIC_API_KEY` | Anthropic API key for LLM features |
+| `ANTHROPIC_BASE_URL` | Anthropic-compatible base URL (default: `https://api.anthropic.com/v1`) |
| `WEBCLAW_PROXY` | Single proxy URL |
| `WEBCLAW_PROXY_FILE` | Path to proxy pool file |
diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs
index e97f15d..a45bce8 100644
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@@ -260,7 +260,7 @@ struct Cli {
#[arg(long, env = "WEBCLAW_LLM_MODEL")]
llm_model: Option,
- /// Override the LLM base URL (Ollama or OpenAI-compatible)
+ /// Override the LLM base URL (Ollama, OpenAI-compatible, or Anthropic-compatible)
#[arg(long, env = "WEBCLAW_LLM_BASE_URL")]
llm_base_url: Option,
@@ -1919,8 +1919,9 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> {
Ok(Box::new(provider))
}
"anthropic" => {
- let provider = webclaw_llm::providers::anthropic::AnthropicProvider::new(
+ let provider = webclaw_llm::providers::anthropic::AnthropicProvider::with_base_url(
None,
+ cli.llm_base_url.clone(),
cli.llm_model.clone(),
)
.ok_or("ANTHROPIC_API_KEY not set")?;
diff --git a/crates/webclaw-llm/src/chain.rs b/crates/webclaw-llm/src/chain.rs
index 314bf2a..86b0101 100644
--- a/crates/webclaw-llm/src/chain.rs
+++ b/crates/webclaw-llm/src/chain.rs
@@ -34,7 +34,7 @@ impl ProviderChain {
providers.push(Box::new(openai));
}
- if let Some(anthropic) = AnthropicProvider::new(None, None) {
+ if let Some(anthropic) = AnthropicProvider::with_base_url(None, None, None) {
debug!("anthropic configured, adding to chain");
providers.push(Box::new(anthropic));
}
diff --git a/crates/webclaw-llm/src/providers/anthropic.rs b/crates/webclaw-llm/src/providers/anthropic.rs
index 71ca1f9..e6e43c8 100644
--- a/crates/webclaw-llm/src/providers/anthropic.rs
+++ b/crates/webclaw-llm/src/providers/anthropic.rs
@@ -10,23 +10,38 @@ use crate::provider::{CompletionRequest, LlmProvider};
use super::load_api_key;
-const ANTHROPIC_API_URL: &str = "https://api.anthropic.com/v1/messages";
+const DEFAULT_ANTHROPIC_BASE_URL: &str = "https://api.anthropic.com/v1";
const ANTHROPIC_VERSION: &str = "2023-06-01";
pub struct AnthropicProvider {
client: reqwest::Client,
key: String,
+ base_url: String,
default_model: String,
}
impl AnthropicProvider {
/// Returns `None` if no API key is available (param or env).
pub fn new(key_override: Option, model: Option) -> Option {
+ Self::with_base_url(key_override, None, model)
+ }
+
+ /// Returns `None` if no API key is available (param or env).
+ pub fn with_base_url(
+ key_override: Option,
+ base_url: Option,
+ model: Option,
+ ) -> Option {
let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?;
Some(Self {
client: reqwest::Client::new(),
key,
+ base_url: base_url
+ .or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok())
+ .unwrap_or_else(|| DEFAULT_ANTHROPIC_BASE_URL.into())
+ .trim_end_matches('/')
+ .to_string(),
default_model: model.unwrap_or_else(|| "claude-sonnet-4-20250514".into()),
})
}
@@ -34,6 +49,14 @@ impl AnthropicProvider {
pub fn default_model(&self) -> &str {
&self.default_model
}
+
+ fn messages_url(&self) -> String {
+ if self.base_url.ends_with("/messages") {
+ self.base_url.clone()
+ } else {
+ format!("{}/messages", self.base_url)
+ }
+ }
}
#[async_trait]
@@ -74,7 +97,7 @@ impl LlmProvider for AnthropicProvider {
let resp = self
.client
- .post(ANTHROPIC_API_URL)
+ .post(self.messages_url())
.header("x-api-key", &self.key)
.header("anthropic-version", ANTHROPIC_VERSION)
.header("content-type", "application/json")
@@ -135,6 +158,11 @@ mod tests {
assert_eq!(provider.name(), "anthropic");
assert_eq!(provider.default_model, "claude-sonnet-4-20250514");
assert_eq!(provider.key, "sk-ant-test");
+ assert_eq!(provider.base_url, "https://api.anthropic.com/v1");
+ assert_eq!(
+ provider.messages_url(),
+ "https://api.anthropic.com/v1/messages"
+ );
}
#[test]
@@ -151,6 +179,35 @@ mod tests {
assert_eq!(provider.default_model(), "claude-sonnet-4-20250514");
}
+ #[test]
+ fn custom_base_url_appends_messages_path() {
+ let provider = AnthropicProvider::with_base_url(
+ Some("sk-ant-test".into()),
+ Some("https://proxy.example.test/anthropic/v1/".into()),
+ None,
+ )
+ .unwrap();
+ assert_eq!(provider.base_url, "https://proxy.example.test/anthropic/v1");
+ assert_eq!(
+ provider.messages_url(),
+ "https://proxy.example.test/anthropic/v1/messages"
+ );
+ }
+
+ #[test]
+ fn custom_full_messages_url_is_not_doubled() {
+ let provider = AnthropicProvider::with_base_url(
+ Some("sk-ant-test".into()),
+ Some("https://proxy.example.test/v1/messages".into()),
+ None,
+ )
+ .unwrap();
+ assert_eq!(
+ provider.messages_url(),
+ "https://proxy.example.test/v1/messages"
+ );
+ }
+
// Env var fallback tests mutate process-global state and race with parallel tests.
// The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
// cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
diff --git a/crates/webclaw-llm/src/providers/openai.rs b/crates/webclaw-llm/src/providers/openai.rs
index 6422cc4..3780d8f 100644
--- a/crates/webclaw-llm/src/providers/openai.rs
+++ b/crates/webclaw-llm/src/providers/openai.rs
@@ -13,6 +13,50 @@ pub struct OpenAiProvider {
key: String,
base_url: String,
default_model: String,
+ response_format: OpenAiResponseFormat,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum OpenAiResponseFormat {
+ JsonObject,
+ JsonSchema,
+ Text,
+}
+
+impl OpenAiResponseFormat {
+ fn from_env() -> Self {
+ std::env::var("OPENAI_RESPONSE_FORMAT_TYPE")
+ .ok()
+ .and_then(|value| Self::parse(&value))
+ .unwrap_or(Self::JsonObject)
+ }
+
+ fn parse(value: &str) -> Option {
+ match value.trim().to_ascii_lowercase().as_str() {
+ "" | "json_object" => Some(Self::JsonObject),
+ "json_schema" => Some(Self::JsonSchema),
+ "text" => Some(Self::Text),
+ _ => None,
+ }
+ }
+
+ fn as_response_format(self) -> serde_json::Value {
+ match self {
+ Self::JsonObject => json!({ "type": "json_object" }),
+ Self::JsonSchema => json!({
+ "type": "json_schema",
+ "json_schema": {
+ "name": "webclaw_response",
+ "schema": {
+ "type": "object",
+ "additionalProperties": true
+ },
+ "strict": false
+ }
+ }),
+ Self::Text => json!({ "type": "text" }),
+ }
+ }
}
impl OpenAiProvider {
@@ -31,23 +75,15 @@ impl OpenAiProvider {
.or_else(|| std::env::var("OPENAI_BASE_URL").ok())
.unwrap_or_else(|| "https://api.openai.com/v1".into()),
default_model: model.unwrap_or_else(|| "gpt-4o-mini".into()),
+ response_format: OpenAiResponseFormat::from_env(),
})
}
pub fn default_model(&self) -> &str {
&self.default_model
}
-}
-
-#[async_trait]
-impl LlmProvider for OpenAiProvider {
- async fn complete(&self, request: &CompletionRequest) -> Result {
- let model = if request.model.is_empty() {
- &self.default_model
- } else {
- &request.model
- };
+ fn request_body(&self, request: &CompletionRequest, model: &str) -> serde_json::Value {
let messages: Vec = request
.messages
.iter()
@@ -60,7 +96,7 @@ impl LlmProvider for OpenAiProvider {
});
if request.json_mode {
- body["response_format"] = json!({ "type": "json_object" });
+ body["response_format"] = self.response_format.as_response_format();
}
if let Some(temp) = request.temperature {
body["temperature"] = json!(temp);
@@ -69,6 +105,21 @@ impl LlmProvider for OpenAiProvider {
body["max_tokens"] = json!(max);
}
+ body
+ }
+}
+
+#[async_trait]
+impl LlmProvider for OpenAiProvider {
+ async fn complete(&self, request: &CompletionRequest) -> Result {
+ let model = if request.model.is_empty() {
+ &self.default_model
+ } else {
+ &request.model
+ };
+
+ let body = self.request_body(request, model);
+
let url = format!("{}/chat/completions", self.base_url);
let resp = self
.client
@@ -136,6 +187,7 @@ mod tests {
assert_eq!(provider.default_model, "gpt-4o-mini");
assert_eq!(provider.base_url, "https://api.openai.com/v1");
assert_eq!(provider.key, "test-key-123");
+ assert_eq!(provider.response_format, OpenAiResponseFormat::JsonObject);
}
#[test]
@@ -161,6 +213,69 @@ mod tests {
assert_eq!(provider.default_model(), "gpt-4o-mini");
}
+ #[test]
+ fn json_mode_defaults_to_openai_json_object() {
+ let provider = OpenAiProvider::new(
+ Some("test-key".into()),
+ Some("https://api.openai.com/v1".into()),
+ None,
+ )
+ .unwrap();
+ let req = CompletionRequest {
+ model: String::new(),
+ messages: vec![],
+ temperature: None,
+ max_tokens: None,
+ json_mode: true,
+ };
+ let body = provider.request_body(&req, provider.default_model());
+ assert_eq!(body["response_format"], json!({ "type": "json_object" }));
+ }
+
+ #[test]
+ fn json_schema_response_format_for_compatible_backends() {
+ let mut provider = OpenAiProvider::new(
+ Some("test-key".into()),
+ Some("http://localhost:1234/v1".into()),
+ Some("local-model".into()),
+ )
+ .unwrap();
+ provider.response_format = OpenAiResponseFormat::JsonSchema;
+ let req = CompletionRequest {
+ model: String::new(),
+ messages: vec![],
+ temperature: None,
+ max_tokens: None,
+ json_mode: true,
+ };
+ let body = provider.request_body(&req, provider.default_model());
+ assert_eq!(body["response_format"]["type"], "json_schema");
+ assert_eq!(
+ body["response_format"]["json_schema"]["schema"]["type"],
+ "object"
+ );
+ }
+
+ #[test]
+ fn text_response_format_for_lm_studio() {
+ let mut provider = OpenAiProvider::new(
+ Some("test-key".into()),
+ Some("http://localhost:1234/v1".into()),
+ Some("local-model".into()),
+ )
+ .unwrap();
+ provider.response_format = OpenAiResponseFormat::Text;
+ let req = CompletionRequest {
+ model: String::new(),
+ messages: vec![],
+ temperature: None,
+ max_tokens: None,
+ json_mode: true,
+ };
+ let body = provider.request_body(&req, provider.default_model());
+ assert_eq!(body["response_format"], json!({ "type": "text" }));
+ }
+
// Env var fallback tests mutate process-global state and race with parallel tests.
// The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
// cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
From e6a95f783dd9eea4fe0b34bfc0e8f70bf3ff74f5 Mon Sep 17 00:00:00 2001
From: Valerio
Date: Wed, 6 May 2026 11:42:09 +0200
Subject: [PATCH 20/51] chore: bump version to 0.5.9
---
CHANGELOG.md | 8 +++++++-
Cargo.lock | 14 +++++++-------
Cargo.toml | 2 +-
3 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8e30acd..7858ae4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,11 +3,17 @@
All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/).
+## [0.5.9] — 2026-05-06
+
+### Fixed
+- LLM providers now support `ANTHROPIC_BASE_URL` for Anthropic-compatible proxies, plus an `OPENAI_RESPONSE_FORMAT_TYPE` override for OpenAI-compatible backends such as LM Studio. Thanks to Toti (`@Toti330`) for the report.
+
+---
+
## [0.5.8] — 2026-05-04
### Added
- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. Thanks to Suryansh Mishra (`@notrealsuryansh`) for the contribution.
-- LLM providers now support `ANTHROPIC_BASE_URL` for Anthropic-compatible proxies, plus an `OPENAI_RESPONSE_FORMAT_TYPE` override for OpenAI-compatible backends such as LM Studio. Thanks to Toti (`@Toti330`) for the report.
### Fixed
- Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise.
diff --git a/Cargo.lock b/Cargo.lock
index 4a6b90e..e49ccc3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3219,7 +3219,7 @@ dependencies = [
[[package]]
name = "webclaw-cli"
-version = "0.5.8"
+version = "0.5.9"
dependencies = [
"clap",
"dotenvy",
@@ -3240,7 +3240,7 @@ dependencies = [
[[package]]
name = "webclaw-core"
-version = "0.5.8"
+version = "0.5.9"
dependencies = [
"ego-tree",
"once_cell",
@@ -3258,7 +3258,7 @@ dependencies = [
[[package]]
name = "webclaw-fetch"
-version = "0.5.8"
+version = "0.5.9"
dependencies = [
"async-trait",
"bytes",
@@ -3284,7 +3284,7 @@ dependencies = [
[[package]]
name = "webclaw-llm"
-version = "0.5.8"
+version = "0.5.9"
dependencies = [
"async-trait",
"reqwest",
@@ -3297,7 +3297,7 @@ dependencies = [
[[package]]
name = "webclaw-mcp"
-version = "0.5.8"
+version = "0.5.9"
dependencies = [
"dirs",
"dotenvy",
@@ -3317,7 +3317,7 @@ dependencies = [
[[package]]
name = "webclaw-pdf"
-version = "0.5.8"
+version = "0.5.9"
dependencies = [
"pdf-extract",
"thiserror",
@@ -3326,7 +3326,7 @@ dependencies = [
[[package]]
name = "webclaw-server"
-version = "0.5.8"
+version = "0.5.9"
dependencies = [
"anyhow",
"axum",
diff --git a/Cargo.toml b/Cargo.toml
index f77595d..12a4b73 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"]
[workspace.package]
-version = "0.5.8"
+version = "0.5.9"
edition = "2024"
license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw"
From 7f7514395415484e0e9da3ad5178e0578917e09d Mon Sep 17 00:00:00 2001
From: Valerio
Date: Wed, 6 May 2026 17:16:35 +0200
Subject: [PATCH 21/51] docs: update hosted api trial copy
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 79758f0..7d936c6 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ It extracts clean, structured content from any URL using Chrome-level TLS finger
**Self-host.** Free, AGPL-3.0, runs locally. Get the CLI, MCP server, or REST API in one command. Ships with the 8 core extraction tools: scrape, crawl, map, batch, extract, summarize, diff, brand.
-**Hosted API** at **[webclaw.io](https://webclaw.io)**. 500 pages/month free, no card. Adds what self-hosting can't do alone: antibot bypass (Cloudflare, DataDome, WAF), JS rendering, async crawl/research jobs, web search, watches. For when you want it to *just work*.
+**Hosted API** at **[webclaw.io](https://webclaw.io)**. Start with a 7-day Starter trial, card required. Adds what self-hosting can't do alone: antibot bypass (Cloudflare, DataDome, WAF), JS rendering, async crawl/research jobs, web search, watches. For when you want it to *just work*.
---
From e8ca1417d699d977fd4d08af435758be127e7226 Mon Sep 17 00:00:00 2001
From: devnen
Date: Sun, 10 May 2026 15:11:12 +0200
Subject: [PATCH 22/51] Improve --format llm output quality (#37)
Improve LLM-format output for modern news and documentation pages.
- Filter noisy hydration and low-value page chrome structured data while preserving content-bearing Schema.org records
- Fix element/text spacing without detaching punctuation on docs, forums, and reference pages
- Remove common accessibility link chrome from LLM text and link labels
- Bump workspace version to 0.6.0 and update the changelog
Thanks to Nenad Oric (@devnen) for the original PR and contribution.
---
CHANGELOG.md | 9 ++
Cargo.lock | 14 +--
Cargo.toml | 2 +-
crates/webclaw-core/src/llm/body.rs | 3 +
crates/webclaw-core/src/llm/cleanup.rs | 83 ++++++++++++++
crates/webclaw-core/src/llm/links.rs | 25 +++++
crates/webclaw-core/src/llm/mod.rs | 148 ++++++++++++++++++++++++-
crates/webclaw-core/src/markdown.rs | 103 ++++++++++++++++-
8 files changed, 371 insertions(+), 16 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7858ae4..025b1db 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,15 @@
All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/).
+## [0.6.0] — 2026-05-10
+
+### Fixed
+- Improved `--format llm` output quality on modern news and documentation pages. Framework hydration blobs and low-value page chrome structured-data records are now filtered out before they can flood the LLM context, while content-bearing Schema.org records are preserved. Thanks and congrats to Nenad Oric (`@devnen`) for the contribution in PR #37.
+- Fixed element-to-text spacing so adjacent inline nodes no longer smash words together, while punctuation stays attached on real pages such as docs, forums, and reference sites.
+- Removed common screen-reader-only link chrome such as "opens new tab" from LLM body text and link labels without stripping ordinary prose that happens to mention external links.
+
+---
+
## [0.5.9] — 2026-05-06
### Fixed
diff --git a/Cargo.lock b/Cargo.lock
index e49ccc3..ab23a3f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3219,7 +3219,7 @@ dependencies = [
[[package]]
name = "webclaw-cli"
-version = "0.5.9"
+version = "0.6.0"
dependencies = [
"clap",
"dotenvy",
@@ -3240,7 +3240,7 @@ dependencies = [
[[package]]
name = "webclaw-core"
-version = "0.5.9"
+version = "0.6.0"
dependencies = [
"ego-tree",
"once_cell",
@@ -3258,7 +3258,7 @@ dependencies = [
[[package]]
name = "webclaw-fetch"
-version = "0.5.9"
+version = "0.6.0"
dependencies = [
"async-trait",
"bytes",
@@ -3284,7 +3284,7 @@ dependencies = [
[[package]]
name = "webclaw-llm"
-version = "0.5.9"
+version = "0.6.0"
dependencies = [
"async-trait",
"reqwest",
@@ -3297,7 +3297,7 @@ dependencies = [
[[package]]
name = "webclaw-mcp"
-version = "0.5.9"
+version = "0.6.0"
dependencies = [
"dirs",
"dotenvy",
@@ -3317,7 +3317,7 @@ dependencies = [
[[package]]
name = "webclaw-pdf"
-version = "0.5.9"
+version = "0.6.0"
dependencies = [
"pdf-extract",
"thiserror",
@@ -3326,7 +3326,7 @@ dependencies = [
[[package]]
name = "webclaw-server"
-version = "0.5.9"
+version = "0.6.0"
dependencies = [
"anyhow",
"axum",
diff --git a/Cargo.toml b/Cargo.toml
index 12a4b73..6e87225 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"]
[workspace.package]
-version = "0.5.9"
+version = "0.6.0"
edition = "2024"
license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw"
diff --git a/crates/webclaw-core/src/llm/body.rs b/crates/webclaw-core/src/llm/body.rs
index 5311121..db2a011 100644
--- a/crates/webclaw-core/src/llm/body.rs
+++ b/crates/webclaw-core/src/llm/body.rs
@@ -29,6 +29,9 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody {
// 0c. Strip leaked JavaScript (framework hydration, self.__wrap_n, etc.)
let text = cleanup::strip_leaked_js(&text);
+ // 0c2. Strip a11y link chrome ("opens new tab", external link hints)
+ let text = cleanup::strip_a11y_link_chrome(&text);
+
// 0d. Collapse spaced-out text (CSS animation artifacts like "S t a r t")
// Must run before any dedup -- spaced text confuses word-based dedup.
let text = cleanup::collapse_spaced_text(&text);
diff --git a/crates/webclaw-core/src/llm/cleanup.rs b/crates/webclaw-core/src/llm/cleanup.rs
index c8e14ed..dc447a5 100644
--- a/crates/webclaw-core/src/llm/cleanup.rs
+++ b/crates/webclaw-core/src/llm/cleanup.rs
@@ -146,6 +146,45 @@ pub(crate) fn strip_leaked_js(input: &str) -> String {
out
}
+// ---------------------------------------------------------------------------
+// Accessibility link chrome ("opens new tab", "external link")
+// ---------------------------------------------------------------------------
+
+/// Strip screen-reader-only link chrome that bleeds into rendered text.
+///
+/// Sites like Reuters wrap external/new-window links with hidden spans
+/// like `, opens new tab`. The noise
+/// filter can't reliably catch these (no consistent class hook across
+/// sites), so they end up duplicated all over the body text. This is a
+/// targeted text-level scrub of the most common phrasings.
+pub(crate) fn strip_a11y_link_chrome(input: &str) -> String {
+ static A11Y_PATTERN: Lazy = Lazy::new(|| {
+ Regex::new(
+ r"(?i)(?:\s*,\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?|\s+\((?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\)\.?|\s+external link\b\.?$)",
+ )
+ .unwrap()
+ });
+
+ let mut out = String::with_capacity(input.len());
+ let mut in_code_fence = false;
+ for (i, line) in input.lines().enumerate() {
+ if i > 0 {
+ out.push('\n');
+ }
+ if line.trim().starts_with("```") {
+ in_code_fence = !in_code_fence;
+ out.push_str(line);
+ continue;
+ }
+ if in_code_fence {
+ out.push_str(line);
+ continue;
+ }
+ out.push_str(&A11Y_PATTERN.replace_all(line, ""));
+ }
+ out
+}
+
// ---------------------------------------------------------------------------
// Spaced-out text collapsing (CSS animation artifacts)
// ---------------------------------------------------------------------------
@@ -1356,4 +1395,48 @@ mod tests {
let input = "```\nImage of something in code\n```";
assert_eq!(strip_alt_text_noise(input), input);
}
+
+ #[test]
+ fn a11y_strips_opens_new_tab() {
+ let input = "Download the App, opens new tab and Subscribe, opens new tab.";
+ let out = strip_a11y_link_chrome(input);
+ assert!(!out.to_lowercase().contains("opens new tab"), "leak: {out}");
+ assert!(out.contains("Download the App"));
+ assert!(out.contains("Subscribe"));
+ }
+
+ #[test]
+ fn a11y_strips_external_link_variants() {
+ let cases = [
+ ("Visit our docs, opens external link", "Visit our docs"),
+ ("Click here, opens in a new window.", "Click here"),
+ ("More info external link", "More info"),
+ ];
+ for (input, expected_prefix) in cases {
+ let out = strip_a11y_link_chrome(input);
+ assert!(
+ out.starts_with(expected_prefix),
+ "input={input:?} got={out:?}"
+ );
+ assert!(!out.to_lowercase().contains("opens"), "leak: {out}");
+ }
+ }
+
+ #[test]
+ fn a11y_preserves_code_blocks() {
+ let input = "```\nopens new tab is a function\n```\nDownload, opens new tab";
+ let out = strip_a11y_link_chrome(input);
+ assert!(
+ out.contains("opens new tab is a function"),
+ "code stripped: {out}"
+ );
+ // Outside the fence, the chrome is removed.
+ assert!(!out.to_lowercase().contains("download, opens new tab"));
+ }
+
+ #[test]
+ fn a11y_preserves_external_link_prose() {
+ let input = "Researchers found an external link between the two incidents.";
+ assert_eq!(strip_a11y_link_chrome(input), input);
+ }
}
diff --git a/crates/webclaw-core/src/llm/links.rs b/crates/webclaw-core/src/llm/links.rs
index 0656aac..3d25179 100644
--- a/crates/webclaw-core/src/llm/links.rs
+++ b/crates/webclaw-core/src/llm/links.rs
@@ -88,10 +88,19 @@ fn is_noise_link(text: &str, href: &str) -> bool {
static MD_MARKERS_RE: Lazy =
Lazy::new(|| Regex::new(r"#{1,6}\s+|\*{1,2}|_{1,2}|`").unwrap());
+static A11Y_LABEL_RE: Lazy = Lazy::new(|| {
+ Regex::new(
+ r"(?i)(?:\s*,?\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website))\b\.?|\s*,\s*external link\b\.?|\s+external link\b\.?$)",
+ )
+ .unwrap()
+});
+
/// Clean a link label: strip markdown, dedup repeated phrases, truncate.
pub(crate) fn clean_link_label(raw: &str) -> String {
// Strip markdown markers
let label = MD_MARKERS_RE.replace_all(raw, "").to_string();
+ // Strip a11y link chrome ("opens new tab", etc.)
+ let label = A11Y_LABEL_RE.replace_all(&label, "").to_string();
let label = label.split_whitespace().collect::>().join(" ");
// Dedup repeated phrases in label
@@ -181,4 +190,20 @@ mod tests {
assert!(is_noise_link("user", "https://hn.com/user?id=foo"));
assert!(!is_noise_link("Rust docs", "https://rust-lang.org"));
}
+
+ #[test]
+ fn link_label_preserves_external_link_prose() {
+ assert_eq!(
+ clean_link_label("Research found an external link between incidents"),
+ "Research found an external link between incidents"
+ );
+ }
+
+ #[test]
+ fn link_label_strips_terminal_external_link_chrome() {
+ assert_eq!(
+ clean_link_label("Reuters story external link"),
+ "Reuters story"
+ );
+ }
}
diff --git a/crates/webclaw-core/src/llm/mod.rs b/crates/webclaw-core/src/llm/mod.rs
index 126558f..bc65be6 100644
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@@ -46,15 +46,73 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
}
// -- 4. Structured data (NEXT_DATA, SvelteKit, JSON-LD) --
- if !result.structured_data.is_empty() {
- out.push_str("\n\n## Structured Data\n\n```json\n");
- out.push_str(&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default());
- out.push_str("\n```");
+ // Only emit useful items: Schema.org records with a meaningful @type plus
+ // compact untyped data, and only if the combined serialized size stays
+ // under a 16 KB budget. Framework hydration blobs (Next.js pageProps full
+ // of ad-targeting flags, build IDs, schedule paths) explode to hundreds of
+ // KB and drown the LLM in noise — drop them rather than ship them.
+ let useful: Vec<_> = result
+ .structured_data
+ .iter()
+ .filter(|v| is_useful_structured_data(v))
+ .cloned()
+ .collect();
+ if !useful.is_empty() {
+ let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
+ const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
+ if serialized.len() <= STRUCTURED_DATA_MAX_BYTES {
+ out.push_str("\n\n## Structured Data\n\n```json\n");
+ out.push_str(&serialized);
+ out.push_str("\n```");
+ }
}
out.trim().to_string()
}
+/// Decide whether a structured-data value carries content worth emitting.
+///
+/// Schema.org records with a recognizable content `@type` (Article, NewsArticle,
+/// Product, Recipe, FAQPage, HowTo, Event, Person, Organization, BreadcrumbList,
+/// VideoObject, JobPosting, etc.) are kept. Records whose only types are
+/// `WebSite` / `WebPage` / `SiteNavigationElement` are dropped, as are untyped
+/// Next.js `pageProps`-style blobs over ~4KB — they're almost always navigation
+/// chrome or framework hydration state.
+fn is_useful_structured_data(v: &serde_json::Value) -> bool {
+ let Some(obj) = v.as_object() else {
+ // SvelteKit can emit compact arrays of page data. Keep those if they
+ // are small enough to be useful, while still dropping giant hydration
+ // arrays under the same budget as untyped objects.
+ if v.is_array() {
+ let serialized = serde_json::to_string(v).unwrap_or_default();
+ return serialized.len() <= 4 * 1024;
+ }
+ return false;
+ };
+ // JSON-LD: @type drives the decision.
+ if let Some(t) = obj.get("@type") {
+ let types: Vec = match t {
+ serde_json::Value::String(s) => vec![s.to_ascii_lowercase()],
+ serde_json::Value::Array(a) => a
+ .iter()
+ .filter_map(|x| x.as_str())
+ .map(str::to_ascii_lowercase)
+ .collect(),
+ _ => Vec::new(),
+ };
+ if types.is_empty() {
+ return false;
+ }
+ // Drop low-info chrome types.
+ const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"];
+ return types.iter().any(|t| !DROP_TYPES.iter().any(|d| t == d));
+ }
+ // Next.js pageProps / SvelteKit data without @type: keep only if compact.
+ // Anything over ~4KB is almost certainly hydration state, not content.
+ let serialized = serde_json::to_string(v).unwrap_or_default();
+ serialized.len() <= 4 * 1024
+}
+
// ---------------------------------------------------------------------------
// Integration tests that exercise the full pipeline through to_llm_text
// ---------------------------------------------------------------------------
@@ -700,4 +758,86 @@ mod tests {
assert!(out.contains("Some content"), "Content before lost: {out}");
assert!(out.contains("More content"), "Content after lost: {out}");
}
+
+ // -- Structured-data gating tests --
+
+ fn make_result_with_structured(values: Vec) -> ExtractionResult {
+ let mut r = make_result("# Body");
+ r.structured_data = values;
+ r
+ }
+
+ #[test]
+ fn structured_data_drops_chrome_types() {
+ // WebSite/WebPage records are framework chrome — should be dropped.
+ let r = make_result_with_structured(vec![serde_json::json!({
+ "@type": "WebSite",
+ "name": "Example",
+ "url": "https://example.com"
+ })]);
+ let out = to_llm_text(&r, None);
+ assert!(
+ !out.contains("## Structured Data"),
+ "WebSite chrome leaked into output: {out}"
+ );
+ }
+
+ #[test]
+ fn structured_data_keeps_article_types() {
+ let r = make_result_with_structured(vec![serde_json::json!({
+ "@type": "NewsArticle",
+ "headline": "Big news",
+ "datePublished": "2026-05-10"
+ })]);
+ let out = to_llm_text(&r, None);
+ assert!(
+ out.contains("## Structured Data"),
+ "NewsArticle dropped: {out}"
+ );
+ assert!(out.contains("Big news"));
+ }
+
+ #[test]
+ fn structured_data_drops_oversized_blob() {
+ // 32KB pageProps-style blob with no @type — should be dropped.
+ let big = "x".repeat(32 * 1024);
+ let r = make_result_with_structured(vec![serde_json::json!({
+ "buildId": "abc",
+ "isFallback": false,
+ "noise": big
+ })]);
+ let out = to_llm_text(&r, None);
+ assert!(
+ !out.contains("## Structured Data"),
+ "Oversized untyped blob leaked: len={}",
+ out.len()
+ );
+ }
+
+ #[test]
+ fn structured_data_keeps_compact_untyped() {
+ // Small untyped record (e.g. a parsed pageProps with real content) — keep.
+ let r = make_result_with_structured(vec![serde_json::json!({
+ "title": "Hi",
+ "body": "small enough to keep"
+ })]);
+ let out = to_llm_text(&r, None);
+ assert!(
+ out.contains("## Structured Data"),
+ "Compact untyped dropped: {out}"
+ );
+ }
+
+ #[test]
+ fn structured_data_keeps_compact_untyped_array() {
+ // SvelteKit can emit compact arrays rather than objects.
+ let r = make_result_with_structured(vec![serde_json::json!([
+ { "title": "Hi", "body": "small array item" }
+ ])]);
+ let out = to_llm_text(&r, None);
+ assert!(
+ out.contains("small array item"),
+ "Compact untyped array dropped: {out}"
+ );
+ }
}
diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs
index d0a2c23..2699166 100644
--- a/crates/webclaw-core/src/markdown.rs
+++ b/crates/webclaw-core/src/markdown.rs
@@ -320,6 +320,9 @@ fn children_to_md(
}
}
Node::Text(text) => {
+ if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) {
+ out.push(' ');
+ }
out.push_str(text);
}
_ => {}
@@ -350,6 +353,9 @@ fn inline_text(
}
}
Node::Text(text) => {
+ if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) {
+ out.push(' ');
+ }
out.push_str(text);
}
_ => {}
@@ -361,11 +367,65 @@ fn inline_text(
/// Check whether a space is needed between two adjacent chunks of output.
/// Returns true when the left side doesn't end with whitespace and the right
-/// side doesn't start with whitespace — i.e., two words would be mashed together.
+/// side doesn't start with whitespace, except around punctuation that should
+/// bind to the adjacent token.
fn needs_separator(left: &str, right: &str) -> bool {
- let l = left.as_bytes().last().copied().unwrap_or(b' ');
- let r = right.as_bytes().first().copied().unwrap_or(b' ');
- !l.is_ascii_whitespace() && !r.is_ascii_whitespace()
+ let l = left.chars().next_back().unwrap_or(' ');
+ let r = right.chars().next().unwrap_or(' ');
+
+ if l.is_whitespace() || r.is_whitespace() {
+ return false;
+ }
+
+ // Do not create "word ," / "word )" / "word 's" artifacts.
+ if is_closing_punctuation(r) {
+ return false;
+ }
+
+ // Do not create "( word" / "[ 1" artifacts.
+ if is_opening_punctuation(l) {
+ return false;
+ }
+
+ // Common inline-code suffixes: `Option`s, `x`'s. Treat them like a
+ // single token rather than separating the text node.
+ if matches!(l, '`' | ')') && starts_with_inline_code_suffix(right) {
+ return false;
+ }
+
+ true
+}
+
+fn starts_with_inline_code_suffix(s: &str) -> bool {
+ let trimmed = s.trim_start_matches(['*', '_']);
+ let mut chars = trimmed.chars();
+ let Some(first) = chars.next() else {
+ return false;
+ };
+
+ if matches!(first, '\'' | '’') {
+ return true;
+ }
+
+ if !matches!(first, 's' | 'S') {
+ return false;
+ }
+
+ match chars.next() {
+ None => true,
+ Some(c) => c.is_whitespace() || is_closing_punctuation(c) || matches!(c, '*' | '_'),
+ }
+}
+
+fn is_closing_punctuation(c: char) -> bool {
+ matches!(
+ c,
+ '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '}' | '%' | '\'' | '’' | '"' | '”'
+ )
+}
+
+fn is_opening_punctuation(c: char) -> bool {
+ matches!(c, '(' | '[' | '{' | '"' | '“')
}
/// Collect raw text content (no markdown formatting).
@@ -1606,4 +1666,39 @@ mod tests {
"collapse_whitespace stripped 6-space indent: {output}"
);
}
+
+ #[test]
+ fn text_after_inline_element_keeps_separator() {
+ // Reuters-style markup: agoTanker crosses...
+ // The "ago" text node sits between two element children. Without a
+ // separator check on the Text branch, "ago" + "Tanker" would smash
+ // together as "agoTanker".
+ let html = r#"
- The fastest web scraper for AI agents.
- 67% fewer tokens. Sub-millisecond extraction. Zero browser overhead.
+ Turn websites into clean markdown, JSON, and LLM-ready context.
+ CLI, MCP server, REST API, and SDKs for AI agents and RAG pipelines.
---
-Your AI agent calls `fetch()` and gets a 403. Or 142KB of raw HTML that burns through your token budget. **webclaw fixes both.**
+Most web scraping tools give your agent one of two bad outputs:
-It extracts clean, structured content from any URL using Chrome-level TLS fingerprinting — no headless browser, no Selenium, no Puppeteer. Output is optimized for LLMs: **67% fewer tokens** than raw HTML, with metadata, links, and images preserved.
+- a blocked page, login wall, or empty app shell
+- raw HTML full of nav, scripts, styling, ads, and duplicated boilerplate
+[webclaw.io](https://webclaw.io) is the hosted web extraction API for webclaw. This repo contains the open-source CLI, MCP server, extraction engine, and self-hostable server.
+
+webclaw turns a URL into clean content your tools can actually use.
+
+```bash
+webclaw https://example.com --format markdown
```
- Raw HTML webclaw
-┌──────────────────────────────────┐ ┌──────────────────────────────────┐
-│