fix(security): prepare 0.6.1 hardening

Merge the 0.6.1 security hardening release candidate after local and CI verification.
2026-07-23 07:21:02 +02:00 · 2026-05-12 12:16:42 +02:00 · 2026-05-12 12:16:42 +02:00 · a629534490
commit a629534490
parent af96628dc9 fd2e75d509
12 changed files with 216 additions and 54 deletions
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -5,14 +5,15 @@ on:
    tags: ["v*"]

 permissions:
-  contents: write
-  packages: write
+  contents: read

 env:
  CARGO_TERM_COLOR: always

 jobs:
  build:
+    permissions:
+      contents: read
    name: Build ${{ matrix.target }}
    runs-on: ${{ matrix.os }}
    strategy:
@ -106,9 +107,9 @@ jobs:
    name: Release
    needs: build
    runs-on: ubuntu-latest
+    permissions:
+      contents: write
    steps:
-      - uses: actions/checkout@v4
-
      - uses: actions/download-artifact@v4
        with:
          path: artifacts
@ -122,18 +123,23 @@ jobs:
          cat SHA256SUMS

      - name: Create GitHub Release
-        uses: softprops/action-gh-release@v2
-        with:
-          generate_release_notes: true
-          files: |
-            artifacts/*.tar.gz
-            artifacts/*.zip
-            artifacts/SHA256SUMS
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # This job no longer checks out the repository, so gh has no git
+          # remote to infer the target repo from; name it explicitly.
+          GH_REPO: ${{ github.repository }}
+        run: |
+          tag="${GITHUB_REF#refs/tags/}"
+          gh release create "$tag" \
+            artifacts/*.tar.gz \
+            artifacts/*.zip \
+            artifacts/SHA256SUMS \
+            --generate-notes

  docker:
    name: Docker
    needs: release
    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
    steps:
      - uses: actions/checkout@v4

@ -193,6 +199,8 @@ jobs:
    name: Update Homebrew
    needs: [release, docker]
    runs-on: ubuntu-latest
+    permissions:
+      contents: read
    steps:
      - name: Compute all checksums and update formula
        env:
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,6 +3,17 @@
 All notable changes to webclaw are documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/).

+## [0.6.1] — 2026-05-12
+
+### Fixed
+- Hardened URL safety across the CLI, MCP server, and self-hosted API paths so local and private network targets are rejected more consistently, including after DNS resolution and redirects.
+- Added a timeout around inline JavaScript data extraction so hostile pages cannot keep the extractor busy forever.
+- Tightened Amazon and eBay URL recognition so deceptive hosts are rejected while common international marketplaces still work.
+- Avoided unnecessary decoding work on large responses during bot-challenge detection.
+- Reduced release workflow token permissions so build jobs run with narrower GitHub access.
+
+---
+
 ## [0.6.0] — 2026-05-10

 ### Fixed
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3219,7 +3219,7 @@ dependencies = [

 [[package]]
 name = "webclaw-cli"
-version = "0.6.0"
+version = "0.6.1"
 dependencies = [
 "clap",
 "dotenvy",
@ -3240,7 +3240,7 @@ dependencies = [

 [[package]]
 name = "webclaw-core"
-version = "0.6.0"
+version = "0.6.1"
 dependencies = [
 "ego-tree",
 "once_cell",
@ -3258,7 +3258,7 @@ dependencies = [

 [[package]]
 name = "webclaw-fetch"
-version = "0.6.0"
+version = "0.6.1"
 dependencies = [
 "async-trait",
 "bytes",
@ -3284,7 +3284,7 @@ dependencies = [

 [[package]]
 name = "webclaw-llm"
-version = "0.6.0"
+version = "0.6.1"
 dependencies = [
 "async-trait",
 "reqwest",
@ -3297,7 +3297,7 @@ dependencies = [

 [[package]]
 name = "webclaw-mcp"
-version = "0.6.0"
+version = "0.6.1"
 dependencies = [
 "dirs",
 "dotenvy",
@ -3317,7 +3317,7 @@ dependencies = [

 [[package]]
 name = "webclaw-pdf"
-version = "0.6.0"
+version = "0.6.1"
 dependencies = [
 "pdf-extract",
 "thiserror",
@ -3326,7 +3326,7 @@ dependencies = [

 [[package]]
 name = "webclaw-server"
-version = "0.6.0"
+version = "0.6.1"
 dependencies = [
 "anyhow",
 "axum",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/*"]

 [workspace.package]
-version = "0.6.0"
+version = "0.6.1"
 edition = "2024"
 license = "AGPL-3.0"
 repository = "https://github.com/0xMassi/webclaw"
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -849,11 +849,18 @@ async fn enrich_html_with_stylesheets(html: &str, base_url: &str) -> String {

    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(5))
+        .redirect(reqwest::redirect::Policy::none())
        .build()
        .unwrap_or_default();

    let mut extra_css = String::new();
    for href in &hrefs {
+        if webclaw_fetch::url_security::validate_public_http_url(href)
+            .await
+            .is_err()
+        {
+            continue;
+        }
        if let Ok(resp) = client.get(href).send().await
            && resp.status().is_success()
            && let Ok(body) = resp.text().await
--- a/crates/webclaw-core/src/js_eval.rs
+++ b/crates/webclaw-core/src/js_eval.rs
@ -9,10 +9,12 @@ use once_cell::sync::Lazy;
 use regex::Regex;
 use rquickjs::{Context, Runtime};
 use scraper::{Html, Selector};
+use std::time::{Duration, Instant};
 use tracing::debug;

 static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").unwrap());
 static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
+const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);

 /// A blob of data extracted from JS execution.
 pub struct JsDataBlob {
@ -49,6 +51,8 @@ pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
    let rt = Runtime::new().expect("QuickJS runtime creation failed");
    rt.set_memory_limit(64 * 1024 * 1024); // 64 MB
    rt.set_max_stack_size(1024 * 1024); // 1 MB
+    let deadline = Instant::now() + JS_EVAL_TIMEOUT;
+    rt.set_interrupt_handler(Some(Box::new(move || Instant::now() >= deadline)));

    let ctx = Context::full(&rt).expect("QuickJS context creation failed");

@ -464,6 +468,8 @@ fn walk_rsc_tree(value: &serde_json::Value, out: &mut Vec<String>, depth: usize)

 #[cfg(test)]
 mod tests {
+    use std::time::{Duration, Instant};
+
    use super::*;

    #[test]
@ -493,6 +499,29 @@ mod tests {
        );
    }

+    #[test]
+    fn js_eval_interrupts_infinite_loops() {
+        // A page whose only inline script never terminates on its own.
+        let html = r#"
+            <html>
+              <head>
+                <script>
+                  while (true) {}
+                </script>
+              </head>
+              <body>hello</body>
+            </html>
+        "#;
+
+        let start = Instant::now();
+        let blobs = extract_js_data(html);
+
+        // The looping script yields no data, and the call must come back at all.
+        assert!(blobs.is_empty());
+        // The bound is well above the 250 ms `JS_EVAL_TIMEOUT`; presumably the
+        // slack is there to tolerate slow CI machines.
+        assert!(
+            start.elapsed() < Duration::from_secs(2),
+            "QuickJS execution should be interrupted quickly"
+        );
+    }
+
    #[test]
    fn skips_external_and_module_scripts() {
        let html = r#"<html><body>
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@ -783,6 +783,10 @@ fn is_pdf_content_type(headers: &http::HeaderMap) -> bool {

 /// Detect if a response looks like a bot protection challenge page.
+///
+/// Responses with an empty body, or a body longer than 15,000, are never
+/// reported as challenges.
 fn is_challenge_response(response: &Response) -> bool {
+    // Check the raw body length first so empty and oversized bodies return
+    // early without calling `response.text()`.
+    let body_len = response.body().len();
+    if body_len > 15_000 || body_len == 0 {
+        return false;
+    }
    is_challenge_html(response.text().as_ref())
 }

--- a/crates/webclaw-fetch/src/extractors/amazon_product.rs
+++ b/crates/webclaw-fetch/src/extractors/amazon_product.rs
@ -30,6 +30,7 @@ use std::sync::OnceLock;

 use regex::Regex;
 use serde_json::{Value, json};
+use url::Url;

 use super::ExtractorInfo;
 use crate::cloud::{self, CloudError};
@ -52,8 +53,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
 };

 pub fn matches(url: &str) -> bool {
-    let host = host_of(url);
-    if !is_amazon_host(host) {
+    let Some(host) = host_of(url) else {
+        return false;
+    };
+    if !is_amazon_host(&host) {
        return false;
    }
    parse_asin(url).is_some()
@ -162,17 +165,41 @@ pub fn parse(html: &str, url: &str, asin: &str) -> Value {
 // URL helpers
 // ---------------------------------------------------------------------------

-fn host_of(url: &str) -> &str {
-    url.split("://")
-        .nth(1)
-        .unwrap_or(url)
-        .split('/')
-        .next()
-        .unwrap_or("")
+/// Return the ASCII-lowercased host of `url`.
+///
+/// Yields `None` when the URL does not parse, when it carries a username or
+/// password, or when it has no host component.
+fn host_of(url: &str) -> Option<String> {
+    let parsed = Url::parse(url).ok()?;
+    // URLs with embedded credentials (e.g. `https://www.amazon.com@127.0.0.1/`)
+    // are refused outright.
+    if !parsed.username().is_empty() || parsed.password().is_some() {
+        return None;
+    }
+    parsed.host_str().map(|host| host.to_ascii_lowercase())
 }

+/// Return `true` when `host`, with an optional leading `www.` removed, is
+/// exactly one of the listed Amazon marketplace domains.
+///
+/// The comparison is an exact, case-sensitive string match, so the caller is
+/// expected to pass an already-lowercased host.
 fn is_amazon_host(host: &str) -> bool {
-    host.starts_with("www.amazon.") || host.starts_with("amazon.")
+    // Exact-match allowlist: the previous prefix check also accepted hosts such
+    // as `www.amazon.evil.com`.
+    const AMAZON_HOSTS: &[&str] = &[
+        "amazon.ae",
+        "amazon.ca",
+        "amazon.cn",
+        "amazon.co.jp",
+        "amazon.co.uk",
+        "amazon.com",
+        "amazon.com.au",
+        "amazon.com.be",
+        "amazon.com.br",
+        "amazon.com.mx",
+        "amazon.com.tr",
+        "amazon.de",
+        "amazon.eg",
+        "amazon.es",
+        "amazon.fr",
+        "amazon.in",
+        "amazon.it",
+        "amazon.nl",
+        "amazon.pl",
+        "amazon.sa",
+        "amazon.se",
+        "amazon.sg",
+    ];
+    // Only a leading `www.` is ignored; any other subdomain fails the match.
+    let normalized = host.strip_prefix("www.").unwrap_or(host);
+    AMAZON_HOSTS.contains(&normalized)
 }

 /// Pull a 10-char ASIN out of any recognised Amazon URL shape:
@ -347,6 +374,9 @@ mod tests {
        assert!(matches("https://www.amazon.com/dp/B0CHX1W1XY"));
        assert!(matches("https://www.amazon.co.uk/dp/B0CHX1W1XY/"));
        assert!(matches("https://www.amazon.de/dp/B0CHX1W1XY?psc=1"));
+        assert!(matches("https://www.amazon.ca/dp/B0CHX1W1XY"));
+        assert!(matches("https://www.amazon.com.au/dp/B0CHX1W1XY"));
+        assert!(matches("https://www.amazon.in/dp/B0CHX1W1XY"));
        assert!(matches(
            "https://www.amazon.com/gp/product/B0CHX1W1XY/ref=foo"
        ));
@ -357,6 +387,8 @@ mod tests {
        assert!(!matches("https://www.amazon.com/"));
        assert!(!matches("https://www.amazon.com/gp/cart"));
        assert!(!matches("https://example.com/dp/B0CHX1W1XY"));
+        assert!(!matches("https://www.amazon.com@127.0.0.1/dp/B0CHX1W1XY"));
+        assert!(!matches("https://www.amazon.evil.com/dp/B0CHX1W1XY"));
    }

    #[test]
--- a/crates/webclaw-fetch/src/extractors/ebay_listing.rs
+++ b/crates/webclaw-fetch/src/extractors/ebay_listing.rs
@ -12,6 +12,7 @@ use std::sync::OnceLock;

 use regex::Regex;
 use serde_json::{Value, json};
+use url::Url;

 use super::ExtractorInfo;
 use crate::cloud::{self, CloudError};
@ -32,8 +33,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
 };

 pub fn matches(url: &str) -> bool {
-    let host = host_of(url);
-    if !is_ebay_host(host) {
+    let Some(host) = host_of(url) else {
+        return false;
+    };
+    if !is_ebay_host(&host) {
        return false;
    }
    parse_item_id(url).is_some()
@ -120,17 +123,37 @@ pub fn parse(html: &str, url: &str, item_id: &str) -> Value {
 // URL helpers
 // ---------------------------------------------------------------------------

-fn host_of(url: &str) -> &str {
-    url.split("://")
-        .nth(1)
-        .unwrap_or(url)
-        .split('/')
-        .next()
-        .unwrap_or("")
+/// Return the ASCII-lowercased host of `url`.
+///
+/// Yields `None` when the URL does not parse, when it carries a username or
+/// password, or when it has no host component.
+fn host_of(url: &str) -> Option<String> {
+    let parsed = Url::parse(url).ok()?;
+    // URLs with embedded credentials (e.g. `https://www.ebay.com@127.0.0.1/`)
+    // are refused outright.
+    if !parsed.username().is_empty() || parsed.password().is_some() {
+        return None;
+    }
+    parsed.host_str().map(|host| host.to_ascii_lowercase())
 }

+/// Return `true` when `host`, with an optional leading `www.` removed, is
+/// exactly one of the listed eBay marketplace domains.
+///
+/// The comparison is an exact, case-sensitive string match, so the caller is
+/// expected to pass an already-lowercased host.
 fn is_ebay_host(host: &str) -> bool {
-    host.starts_with("www.ebay.") || host.starts_with("ebay.")
+    // Exact-match allowlist: the previous prefix check also accepted hosts such
+    // as `www.ebay.attacker.com`.
+    const EBAY_HOSTS: &[&str] = &[
+        "ebay.at",
+        "ebay.be",
+        "ebay.ca",
+        "ebay.ch",
+        "ebay.co.uk",
+        "ebay.com",
+        "ebay.com.au",
+        "ebay.com.hk",
+        "ebay.com.my",
+        "ebay.com.sg",
+        "ebay.de",
+        "ebay.es",
+        "ebay.fr",
+        "ebay.ie",
+        "ebay.it",
+        "ebay.nl",
+        "ebay.ph",
+        "ebay.pl",
+    ];
+    // Only a leading `www.` is ignored; any other subdomain fails the match.
+    let normalized = host.strip_prefix("www.").unwrap_or(host);
+    EBAY_HOSTS.contains(&normalized)
 }

 /// Pull the numeric item id out of `/itm/{id}` or `/itm/{slug}/{id}`
@ -273,9 +296,14 @@ mod tests {
            "https://www.ebay.com/itm/vintage-typewriter/325478156234"
        ));
        assert!(matches("https://www.ebay.co.uk/itm/325478156234"));
+        assert!(matches("https://www.ebay.ca/itm/325478156234"));
+        assert!(matches("https://www.ebay.com.au/itm/325478156234"));
+        assert!(matches("https://www.ebay.es/itm/325478156234"));
        assert!(!matches("https://www.ebay.com/"));
        assert!(!matches("https://www.ebay.com/sch/foo"));
        assert!(!matches("https://example.com/itm/325478156234"));
+        assert!(!matches("https://www.ebay.com@127.0.0.1/itm/325478156234"));
+        assert!(!matches("https://www.ebay.attacker.com/itm/325478156234"));
    }

    #[test]
--- a/crates/webclaw-fetch/src/tls.rs
+++ b/crates/webclaw-fetch/src/tls.rs
@ -5,9 +5,7 @@
 //! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order,
 //! stream dependency, priorities) to match real browser fingerprints.

-use std::time::Duration;
-
-use std::borrow::Cow;
+use std::{borrow::Cow, io, time::Duration};

 use wreq::http2::{
    Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
@ -21,6 +19,41 @@ use wreq::{Client, Emulation};
 use crate::browser::BrowserVariant;
 use crate::error::FetchError;

+/// DNS resolver that refuses any hostname resolving to an address rejected by
+/// `url_security::is_blocked_ip`.
+///
+/// Lookups go through `tokio::net::lookup_host`. A single blocked address fails
+/// the whole lookup with `PermissionDenied`; an empty result fails with
+/// `NotFound`; lookup errors are passed through boxed.
+#[derive(Clone, Default)]
+struct PublicDnsResolver;
+
+impl wreq::dns::Resolve for PublicDnsResolver {
+    fn resolve(&self, name: wreq::dns::Name) -> wreq::dns::Resolving {
+        Box::pin(async move {
+            // NOTE(review): port 0 looks like a placeholder that the client
+            // replaces with the request's real port — confirm against wreq.
+            let addrs = tokio::net::lookup_host((name.as_str(), 0))
+                .await
+                .map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>)?;
+            let mut public = Vec::new();
+
+            for addr in addrs {
+                // Fail closed: one blocked address rejects the entire result
+                // set instead of being filtered out of it.
+                if crate::url_security::is_blocked_ip(addr.ip()) {
+                    let err: Box<dyn std::error::Error + Send + Sync> = Box::new(io::Error::new(
+                        io::ErrorKind::PermissionDenied,
+                        "DNS resolved to a blocked private or internal address",
+                    ));
+                    return Err(err);
+                }
+                public.push(addr);
+            }
+
+            // A successful lookup that produced no addresses is reported as an error.
+            if public.is_empty() {
+                let err: Box<dyn std::error::Error + Send + Sync> = Box::new(io::Error::new(
+                    io::ErrorKind::NotFound,
+                    "host did not resolve to any addresses",
+                ));
+                return Err(err);
+            }
+
+            Ok(Box::new(public.into_iter()) as wreq::dns::Addrs)
+        })
+    }
+}
+
 /// Chrome cipher list (TLS 1.3 + TLS 1.2 in Chrome's exact order).
 const CHROME_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";

@ -503,6 +536,8 @@ pub fn build_client(
        let proxy =
            wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
        builder = builder.proxy(proxy);
+    } else {
+        builder = builder.dns_resolver(PublicDnsResolver);
    }

    builder
--- a/crates/webclaw-fetch/src/url_security.rs
+++ b/crates/webclaw-fetch/src/url_security.rs
@ -163,7 +163,9 @@ mod tests {
            Ipv4Addr::new(169, 254, 169, 254),
            Ipv4Addr::new(172, 16, 0, 1),
            Ipv4Addr::new(192, 168, 0, 1),
+            Ipv4Addr::new(192, 0, 0, 8),
            Ipv4Addr::new(198, 18, 0, 1),
+            Ipv4Addr::new(255, 255, 255, 255),
        ] {
            let url = format!("http://{ip}/");
            assert!(validate_public_http_url(&url).await.is_err(), "{ip}");
@ -193,4 +195,9 @@ mod tests {
        );
        assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false);
    }
+
+    #[tokio::test]
+    async fn blocks_localhost_domains_after_resolution() {
+        // `localhost` is a hostname rather than an IP literal, and must still
+        // be rejected by the public-URL validator.
+        assert!(validate_public_http_url("http://localhost/").await.is_err());
+    }
 }
--- a/crates/webclaw-mcp/src/server.rs
+++ b/crates/webclaw-mcp/src/server.rs
@ -51,9 +51,10 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
    }
 }

-/// Validate that a URL is non-empty and has an http or https scheme.
-fn validate_url(url: &str) -> Result<(), String> {
-    webclaw_fetch::url_security::validate_http_url(url)
+/// Validate that a URL is public HTTP(S), matching the fetch-layer SSRF guard.
+///
+/// Delegates to `webclaw_fetch::url_security::validate_public_http_url`. The
+/// validator's success value is discarded, and a failure is returned as an
+/// `Invalid URL: {e}` message string.
+async fn validate_url(url: &str) -> Result<(), String> {
+    webclaw_fetch::url_security::validate_public_http_url(url)
+        .await
        .map(|_| ())
        .map_err(|e| format!("Invalid URL: {e}"))
 }
@ -161,7 +162,7 @@ impl WebclawMcp {
    /// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected.
    #[tool]
    async fn scrape(&self, Parameters(params): Parameters<ScrapeParams>) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;
        let format = params.format.as_deref().unwrap_or("markdown");
        let browser = parse_browser(params.browser.as_deref());
        let include = params.include_selectors.unwrap_or_default();
@ -251,7 +252,7 @@ impl WebclawMcp {
    /// Crawl a website starting from a seed URL, following links breadth-first up to a configurable depth and page limit.
    #[tool]
    async fn crawl(&self, Parameters(params): Parameters<CrawlParams>) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;

        if let Some(max) = params.max_pages
            && max > 500
@ -300,7 +301,7 @@ impl WebclawMcp {
    /// Discover URLs from a website's sitemaps (robots.txt + sitemap.xml).
    #[tool]
    async fn map(&self, Parameters(params): Parameters<MapParams>) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;
        let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, &params.url)
            .await
            .map_err(|e| format!("Sitemap discovery failed: {e}"))?;
@ -323,7 +324,7 @@ impl WebclawMcp {
            return Err("batch is limited to 100 URLs per request".into());
        }
        for u in &params.urls {
-            validate_url(u)?;
+            validate_url(u).await?;
        }

        let format = params.format.as_deref().unwrap_or("markdown");
@ -365,7 +366,7 @@ impl WebclawMcp {
        &self,
        Parameters(params): Parameters<ExtractParams>,
    ) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;

        if params.schema.is_none() && params.prompt.is_none() {
            return Err("Either 'schema' or 'prompt' is required for extraction.".into());
@ -422,7 +423,7 @@ impl WebclawMcp {
        &self,
        Parameters(params): Parameters<SummarizeParams>,
    ) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;

        // No local LLM — fall back to cloud API directly
        if self.llm_chain.is_none() {
@ -464,7 +465,7 @@ impl WebclawMcp {
    /// Automatically falls back to the webclaw cloud API when bot protection is detected.
    #[tool]
    async fn diff(&self, Parameters(params): Parameters<DiffParams>) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;
        let previous: webclaw_core::ExtractionResult =
            serde_json::from_str(&params.previous_snapshot)
                .map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
@ -532,7 +533,7 @@ impl WebclawMcp {
    /// Automatically falls back to the webclaw cloud API when bot protection is detected.
    #[tool]
    async fn brand(&self, Parameters(params): Parameters<BrandParams>) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;
        let fetch_result =
            tokio::time::timeout(LOCAL_FETCH_TIMEOUT, self.fetch_client.fetch(&params.url))
                .await
@ -737,7 +738,7 @@ impl WebclawMcp {
        &self,
        Parameters(params): Parameters<VerticalParams>,
    ) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;
        // Use the cached Firefox client, not the default Chrome one.
        // Reddit's `.json` endpoint rejects the wreq-Chrome TLS
        // fingerprint with a 403 even from residential IPs (they