fix: harden fetch URL validation

2026-07-23 07:21:02 +02:00 · 2026-05-04 11:50:57 +02:00 · 2026-05-04 11:50:57 +02:00 · bdf81fe6bf
commit bdf81fe6bf
parent 23544f8fac
10 changed files with 284 additions and 27 deletions
--- a/8
+++ b/8
@ -73,11 +73,9 @@ COPY --from=builder /build/target/release/webclaw-server /usr/local/bin/webclaw-
 # as documentation; callers still need `-p 3000:3000` on `docker run`.
 EXPOSE 3000
-# Container default: bind all interfaces so `-p 3000:3000` works. The binary
+# Container default: bind all interfaces so `-p 3000:3000` works. Public
-# itself defaults to 127.0.0.1 (safe for `cargo run` on a laptop); inside
+# binding requires WEBCLAW_API_KEY; the binary refuses open-auth 0.0.0.0
-# Docker that would make the server unreachable, so we flip it here.
+# unless WEBCLAW_ALLOW_OPEN_PUBLIC=1 is set explicitly for local testing.
 # Override with -e WEBCLAW_HOST=127.0.0.1 if you front this with another
 # process in the same container.
 ENV WEBCLAW_HOST=0.0.0.0
 # Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@ -199,6 +199,8 @@ impl FetchClient {
                        config.timeout,
                        &config.headers,
                        config.proxy.as_deref(),
                        config.follow_redirects,
                        config.max_redirects,
                    )
                })
                .collect::<Result<Vec<_>, _>>()?;
@ -218,7 +220,14 @@ impl FetchClient {
                .iter()
                .map(|proxy| {
                    let v = *variants.choose(&mut rng).unwrap();
-                    crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy))
+                    crate::tls::build_client(
                        v,
                        config.timeout,
                        &config.headers,
                        Some(proxy),
                        config.follow_redirects,
                        config.max_redirects,
                    )
                })
                .collect::<Result<Vec<_>, _>>()?;
@ -379,6 +388,8 @@ impl FetchClient {
        url: &str,
        extra: &[(&str, &str)],
    ) -> Result<FetchResult, FetchError> {
        let parsed_url = crate::url_security::validate_public_http_url(url).await?;
        let url = parsed_url.as_str();
        let start = Instant::now();
        let client = self.pick_client(url);
@ -463,13 +474,17 @@ impl FetchClient {
        url: &str,
        options: &webclaw_core::ExtractionOptions,
    ) -> Result<webclaw_core::ExtractionResult, FetchError> {
        let parsed_url = crate::url_security::validate_public_http_url(url).await?;
        let url = parsed_url.as_str();
        // Reddit fallback: use their JSON API to get post + full comment tree.
        if crate::reddit::is_reddit_url(url) {
            let json_url = crate::reddit::json_url(url);
            let json_url = crate::url_security::validate_public_http_url(&json_url).await?;
            debug!("reddit detected, fetching {json_url}");
            let client = self.pick_client(url);
-            let resp = client.get(&json_url).send().await?;
+            let resp = client.get(json_url.as_str()).send().await?;
            let response = Response::from_wreq(resp).await?;
            if response.is_success() {
                let bytes = response.body();
@ -491,7 +506,7 @@ impl FetchClient {
            && let Some(homepage) = extract_homepage(url)
        {
            debug!("challenge detected, warming cookies via {homepage}");
-            let _ = client.get(&homepage).send().await;
+            let _ = self.fetch(&homepage).await;
            let resp = client.get(url).send().await?;
            response = Response::from_wreq(resp).await?;
            debug!("retried after cookie warmup: status={}", response.status());
--- a/crates/webclaw-fetch/src/lib.rs
+++ b/crates/webclaw-fetch/src/lib.rs
@ -15,6 +15,7 @@ pub mod proxy;
 pub mod reddit;
 pub mod sitemap;
 pub mod tls;
 pub mod url_security;
 pub use browser::BrowserProfile;
 pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
--- a/crates/webclaw-fetch/src/tls.rs
+++ b/crates/webclaw-fetch/src/tls.rs
@ -455,6 +455,8 @@ pub fn build_client(
    timeout: Duration,
    extra_headers: &std::collections::HashMap<String, String>,
    proxy: Option<&str>,
    follow_redirects: bool,
    max_redirects: u32,
 ) -> Result<Client, FetchError> {
    // SafariIos26 builds its Emulation on top of wreq-util's base instead
    // of from scratch. See `safari_ios_emulation` for why.
@ -490,7 +492,10 @@ pub fn build_client(
    let mut builder = Client::builder()
        .emulation(emulation)
-        .redirect(wreq::redirect::Policy::limited(10))
+        .redirect(ssrf_safe_redirect_policy(
            follow_redirects,
            max_redirects as usize,
        ))
        .cookie_store(true)
        .timeout(timeout);
@ -504,3 +509,26 @@ pub fn build_client(
        .build()
        .map_err(|e| FetchError::Build(e.to_string()))
 }
 /// Build the redirect policy installed on every client.
 ///
 /// * `follow_redirects == false` yields a policy that follows nothing.
 /// * Otherwise each hop is refused once `attempt.previous` holds more than
 ///   `max_redirects` entries, and every remaining hop is passed through
 ///   `url_security::validate_public_http_url` before it is followed, so a
 ///   redirect is held to the same public-host check as a caller-supplied URL.
 ///
 /// NOTE(review): the validator resolves the host on its own and its result is
 /// discarded here; whether the client connects to the same address it checked
 /// is not visible from this function — confirm.
 fn ssrf_safe_redirect_policy(
    follow_redirects: bool,
    max_redirects: usize,
 ) -> wreq::redirect::Policy {
    if follow_redirects {
        wreq::redirect::Policy::custom(move |attempt| {
            let seen = attempt.previous.len();
            if seen <= max_redirects {
                // The target check is async (it may resolve DNS), so the
                // decision is deferred to a pending future.
                attempt.pending(|pending| async move {
                    let target = pending.uri.to_string();
                    let verdict =
                        crate::url_security::validate_public_http_url(&target).await;
                    if let Err(reason) = verdict {
                        return pending.error(reason.to_string());
                    }
                    pending.follow()
                })
            } else {
                attempt.error("too many redirects")
            }
        })
    } else {
        wreq::redirect::Policy::none()
    }
 }
--- a/crates/webclaw-fetch/src/url_security.rs
+++ b/crates/webclaw-fetch/src/url_security.rs
@ -0,0 +1,196 @@
 //! SSRF guard for every server-side fetch.
 //!
 //! Callers may still do cheap parse validation at the edge, but this
 //! module is the fetch-layer authority because redirects and helper
 //! fetches also pass through it.
 use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
 use tokio::net::lookup_host;
 use url::{Host, Url};
 use crate::error::FetchError;
 /// Syntactic URL check: parse `raw` and accept it only when it is an
 /// `http`/`https` URL that carries a host.
 ///
 /// Leading and trailing whitespace is trimmed before parsing. No DNS lookup
 /// happens here.
 ///
 /// # Errors
 ///
 /// Returns [`FetchError::InvalidUrl`] when the trimmed input is empty, does
 /// not parse, uses any other scheme, or has no host.
 pub fn validate_http_url(raw: &str) -> Result<Url, FetchError> {
    let candidate = raw.trim();
    if candidate.is_empty() {
        return Err(FetchError::InvalidUrl("URL must not be empty".into()));
    }

    let url = match Url::parse(candidate) {
        Ok(url) => url,
        Err(e) => return Err(FetchError::InvalidUrl(format!("invalid URL: {e}"))),
    };

    let scheme = url.scheme();
    if !matches!(scheme, "http" | "https") {
        return Err(FetchError::InvalidUrl(format!(
            "scheme '{scheme}' is not allowed, use http:// or https://"
        )));
    }

    if url.host().is_some() {
        Ok(url)
    } else {
        Err(FetchError::InvalidUrl("URL must include a host".into()))
    }
 }
 /// Full SSRF check: run [`validate_http_url`], then require that the host is
 /// not a private or internal destination.
 ///
 /// For domain names every resolved address must pass; a single private or
 /// reserved answer rejects the whole URL. This is deliberately conservative,
 /// because a mixed public/private DNS answer is unsafe to fetch server-side.
 ///
 /// # Errors
 ///
 /// Returns [`FetchError::InvalidUrl`] for any parse, resolution, or
 /// blocked-address failure.
 pub async fn validate_public_http_url(raw: &str) -> Result<Url, FetchError> {
    let url = validate_http_url(raw)?;
    validate_url_host_is_public(&url).await.map(|()| url)
 }
 /// Check the host of an already-parsed URL against the blocked IP ranges.
 ///
 /// IP-literal hosts are checked directly. Domain hosts are resolved with
 /// `lookup_host` (using the URL's explicit or scheme-default port) and every
 /// returned address must be acceptable; a name that resolves to nothing is
 /// rejected as well.
 async fn validate_url_host_is_public(url: &Url) -> Result<(), FetchError> {
    let name = match url.host() {
        None => return Err(FetchError::InvalidUrl("URL must include a host".into())),
        Some(Host::Ipv4(v4)) => return reject_blocked_ip(IpAddr::V4(v4)),
        Some(Host::Ipv6(v6)) => return reject_blocked_ip(IpAddr::V6(v6)),
        Some(Host::Domain(name)) => name,
    };

    let Some(port) = url.port_or_known_default() else {
        return Err(FetchError::InvalidUrl("URL must include a known port".into()));
    };

    let mut answers = lookup_host((name, port))
        .await
        .map_err(|e| FetchError::InvalidUrl(format!("failed to resolve host: {e}")))?
        .peekable();

    // An empty answer set gets its own error rather than passing vacuously.
    if answers.peek().is_none() {
        return Err(FetchError::InvalidUrl(
            "host did not resolve to any addresses".into(),
        ));
    }

    // Stops at the first blocked address.
    answers.try_for_each(|addr| reject_blocked_ip(addr.ip()))
 }
 /// Turn the [`is_blocked_ip`] predicate into a `Result`: `Ok(())` for an
 /// acceptable address, [`FetchError::InvalidUrl`] for a blocked one.
 fn reject_blocked_ip(ip: IpAddr) -> Result<(), FetchError> {
    if !is_blocked_ip(ip) {
        return Ok(());
    }
    Err(FetchError::InvalidUrl(
        "URL resolves to a blocked private or internal address".into(),
    ))
 }
 /// Report whether `ip` falls in a range that must never be fetched
 /// server-side, dispatching to the per-family check.
 pub fn is_blocked_ip(ip: IpAddr) -> bool {
    match ip {
        IpAddr::V6(v6) => is_blocked_ipv6(v6),
        IpAddr::V4(v4) => is_blocked_ipv4(v4),
    }
 }
 /// Return `true` when an IPv4 address lies in a non-public range.
 ///
 /// Each arm below is one blocked prefix; anything that matches none of them
 /// is treated as publicly routable.
 fn is_blocked_ipv4(ip: Ipv4Addr) -> bool {
    match ip.octets() {
        // 0.0.0.0/8 — "this network", includes the unspecified address.
        [0, ..] => true,
        // 10.0.0.0/8 — private.
        [10, ..] => true,
        // 100.64.0.0/10 — shared address space (carrier-grade NAT).
        [100, 64..=127, ..] => true,
        // 127.0.0.0/8 — loopback.
        [127, ..] => true,
        // 169.254.0.0/16 — link-local (cloud metadata endpoints live here).
        [169, 254, ..] => true,
        // 172.16.0.0/12 — private.
        [172, 16..=31, ..] => true,
        // 192.0.0.0/24 (protocol assignments) and 192.0.2.0/24 (TEST-NET-1).
        [192, 0, 0 | 2, _] => true,
        // 192.168.0.0/16 — private.
        [192, 168, ..] => true,
        // 198.18.0.0/15 — benchmarking.
        [198, 18..=19, ..] => true,
        // 198.51.100.0/24 — TEST-NET-2.
        [198, 51, 100, _] => true,
        // 203.0.113.0/24 — TEST-NET-3.
        [203, 0, 113, _] => true,
        // 224.0.0.0/3 — multicast, reserved, and broadcast.
        [224..=255, ..] => true,
        _ => false,
    }
 }
 fn is_blocked_ipv6(ip: Ipv6Addr) -> bool {
    let s = ip.segments();
    ip.is_unspecified()
        || ip.is_loopback()
        || ip.is_multicast()
        || (s[0] & 0xfe00) == 0xfc00
        || (s[0] & 0xffc0) == 0xfe80
        || (s[0] == 0x0064 && s[1] == 0xff9b && s[2] == 0 && s[3] == 0 && s[4] == 0 && s[5] == 0)
        || (s[0] == 0x2001 && s[1] == 0x0db8)
        || embedded_ipv4(ip).is_some_and(is_blocked_ipv4)
 }
 /// Extract the IPv4 address carried by an IPv4-mapped (`::ffff:a.b.c.d`) or
 /// IPv4-compatible (`::a.b.c.d`) IPv6 address; `None` for any other layout.
 ///
 /// `Ipv6Addr::to_ipv4` recognises exactly these two layouts, so it replaces
 /// the two hand-written segment decoders. `::` and `::1` count as
 /// IPv4-compatible and come back as `0.0.0.0` and `0.0.0.1`.
 fn embedded_ipv4(ip: Ipv6Addr) -> Option<Ipv4Addr> {
    ip.to_ipv4()
 }
 #[cfg(test)]
 mod tests {
    use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};

    use super::{is_blocked_ip, validate_public_http_url};

    /// IP-literal URLs in internal or reserved IPv4 ranges are rejected by
    /// the full async validator.
    #[tokio::test]
    async fn blocks_ipv4_internal_ranges() {
        for ip in [
            Ipv4Addr::new(0, 0, 0, 0),
            Ipv4Addr::new(10, 0, 0, 1),
            Ipv4Addr::new(100, 64, 0, 1),
            Ipv4Addr::new(127, 0, 0, 1),
            Ipv4Addr::new(169, 254, 169, 254),
            Ipv4Addr::new(172, 16, 0, 1),
            Ipv4Addr::new(192, 0, 2, 1),
            Ipv4Addr::new(192, 168, 0, 1),
            Ipv4Addr::new(198, 18, 0, 1),
            Ipv4Addr::new(224, 0, 0, 1),
            Ipv4Addr::new(255, 255, 255, 255),
        ] {
            let url = format!("http://{ip}/");
            assert!(validate_public_http_url(&url).await.is_err(), "{ip}");
        }
    }

    /// Pure predicate check, so no async runtime is needed.
    #[test]
    fn blocks_ipv6_internal_ranges() {
        for ip in [
            Ipv6Addr::LOCALHOST,
            Ipv6Addr::UNSPECIFIED,
            "fc00::1".parse().unwrap(),
            "fe80::1".parse().unwrap(),
            "ff02::1".parse().unwrap(),
            "2001:db8::1".parse().unwrap(),
            "64:ff9b::7f00:1".parse().unwrap(),
            "::ffff:127.0.0.1".parse().unwrap(),
            "::127.0.0.1".parse().unwrap(),
        ] {
            assert!(is_blocked_ip(IpAddr::V6(ip)), "{ip}");
        }
    }

    /// Public IP literals pass both the URL validator and the raw predicate.
    #[tokio::test]
    async fn allows_public_ip_literals() {
        assert!(
            validate_public_http_url("https://93.184.216.34/")
                .await
                .is_ok()
        );
        assert!(!is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))));
    }
 }
--- a/crates/webclaw-mcp/src/server.rs
+++ b/crates/webclaw-mcp/src/server.rs
@ -13,7 +13,6 @@ use rmcp::model::{Implementation, ServerCapabilities, ServerInfo};
 use rmcp::{ServerHandler, tool, tool_handler, tool_router};
 use serde_json::json;
 use tracing::{error, info, warn};
 use url::Url;
 use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult};
@ -54,19 +53,9 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
 /// Validate that a URL is non-empty and has an http or https scheme.
 fn validate_url(url: &str) -> Result<(), String> {
-    if url.is_empty() {
+    webclaw_fetch::url_security::validate_http_url(url)
-        return Err("Invalid URL: must not be empty".into());
+        .map(|_| ())
-    }
+        .map_err(|e| format!("Invalid URL: {e}"))
    match Url::parse(url) {
        Ok(parsed) if parsed.scheme() == "http" || parsed.scheme() == "https" => Ok(()),
        Ok(parsed) => Err(format!(
            "Invalid URL: scheme '{}' not allowed, must start with http:// or https://",
            parsed.scheme()
        )),
        Err(e) => Err(format!(
            "Invalid URL: {e}. Must start with http:// or https://"
        )),
    }
 }
 /// Timeout for local fetch calls (prevents hanging on tarpitting servers).
--- a/crates/webclaw-server/src/error.rs
+++ b/crates/webclaw-server/src/error.rs
@ -70,7 +70,12 @@ impl IntoResponse for ApiError {
 impl From<webclaw_fetch::FetchError> for ApiError {
    fn from(e: webclaw_fetch::FetchError) -> Self {
-        Self::Fetch(e.to_string())
+        match e {
            webclaw_fetch::FetchError::InvalidUrl(msg) => {
                Self::BadRequest(format!("invalid url: {msg}"))
            }
            other => Self::Fetch(other.to_string()),
        }
    }
 }
--- a/crates/webclaw-server/src/main.rs
+++ b/crates/webclaw-server/src/main.rs
@ -75,6 +75,15 @@ async fn main() -> anyhow::Result<()> {
        .compact()
        .init();
    if is_unspecified_addr(args.host)
        && args.api_key.is_none()
        && std::env::var_os("WEBCLAW_ALLOW_OPEN_PUBLIC").is_none()
    {
        anyhow::bail!(
            "refusing to bind 0.0.0.0/[::] without WEBCLAW_API_KEY; set WEBCLAW_API_KEY or WEBCLAW_ALLOW_OPEN_PUBLIC=1 to override"
        );
    }
    let state = AppState::new(args.api_key.clone())?;
    let v1 = Router::new()
@ -121,3 +130,10 @@ async fn main() -> anyhow::Result<()> {
    axum::serve(listener, app).await?;
    Ok(())
 }
 /// Return `true` for the wildcard bind addresses `0.0.0.0` and `::`.
 ///
 /// `IpAddr::is_unspecified` already dispatches on the address family, so no
 /// manual match is needed.
 fn is_unspecified_addr(addr: IpAddr) -> bool {
    addr.is_unspecified()
 }
--- a/crates/webclaw-server/src/routes/batch.rs
+++ b/crates/webclaw-server/src/routes/batch.rs
@ -37,6 +37,14 @@ pub async fn batch(
            req.urls.len()
        )));
    }
    let mut safe_urls = Vec::with_capacity(req.urls.len());
    for url in &req.urls {
        safe_urls.push(
            webclaw_fetch::url_security::validate_public_http_url(url)
                .await?
                .to_string(),
        );
    }
    let concurrency = req.concurrency.unwrap_or(5).clamp(1, HARD_MAX_CONCURRENCY);
@ -47,7 +55,7 @@ pub async fn batch(
        include_raw_html: false,
    };
-    let url_refs: Vec<&str> = req.urls.iter().map(|s| s.as_str()).collect();
+    let url_refs: Vec<&str> = safe_urls.iter().map(|s| s.as_str()).collect();
    let results = state
        .fetch()
        .fetch_and_extract_batch_with_options(&url_refs, concurrency, &options)
--- a/crates/webclaw-server/src/routes/scrape.rs
+++ b/crates/webclaw-server/src/routes/scrape.rs
@ -52,6 +52,7 @@ pub async fn scrape(
    if req.url.trim().is_empty() {
        return Err(ApiError::bad_request("`url` is required"));
    }
    let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
    let formats = req.formats.as_vec();
    let options = ExtractionOptions {
@ -63,11 +64,11 @@ pub async fn scrape(
    let extraction = state
        .fetch()
-        .fetch_and_extract_with_options(&req.url, &options)
+        .fetch_and_extract_with_options(url.as_str(), &options)
        .await?;
    let mut body = json!({
-        "url": extraction.metadata.url.clone().unwrap_or_else(|| req.url.clone()),
+        "url": extraction.metadata.url.clone().unwrap_or_else(|| url.to_string()),
        "metadata": extraction.metadata,
    });
    let obj = body.as_object_mut().expect("json::object");