fix(security): harden local fetch surfaces

2026-07-23 07:21:02 +02:00 · 2026-05-12 12:00:25 +02:00 · 2026-05-12 12:00:25 +02:00 · a611ae26f3
commit a611ae26f3
parent af96628dc9
5 changed files with 94 additions and 15 deletions
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -849,11 +849,18 @@ async fn enrich_html_with_stylesheets(html: &str, base_url: &str) -> String {

    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(5))
+        .redirect(reqwest::redirect::Policy::none())
        .build()
        .unwrap_or_default();

    let mut extra_css = String::new();
    for href in &hrefs {
+        if webclaw_fetch::url_security::validate_public_http_url(href)
+            .await
+            .is_err()
+        {
+            continue;
+        }
        if let Ok(resp) = client.get(href).send().await
            && resp.status().is_success()
            && let Ok(body) = resp.text().await
--- a/crates/webclaw-core/src/js_eval.rs
+++ b/crates/webclaw-core/src/js_eval.rs
@ -9,10 +9,12 @@ use once_cell::sync::Lazy;
 use regex::Regex;
 use rquickjs::{Context, Runtime};
 use scraper::{Html, Selector};
+use std::time::{Duration, Instant};
 use tracing::debug;

 static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").unwrap());
 static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
+const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);

 /// A blob of data extracted from JS execution.
 pub struct JsDataBlob {
@ -49,6 +51,8 @@ pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
    let rt = Runtime::new().expect("QuickJS runtime creation failed");
    rt.set_memory_limit(64 * 1024 * 1024); // 64 MB
    rt.set_max_stack_size(1024 * 1024); // 1 MB
+    let deadline = Instant::now() + JS_EVAL_TIMEOUT;
+    rt.set_interrupt_handler(Some(Box::new(move || Instant::now() >= deadline)));

    let ctx = Context::full(&rt).expect("QuickJS context creation failed");

@ -464,6 +468,8 @@ fn walk_rsc_tree(value: &serde_json::Value, out: &mut Vec<String>, depth: usize)

 #[cfg(test)]
 mod tests {
+    use std::time::{Duration, Instant};
+
    use super::*;

    #[test]
@ -493,6 +499,29 @@ mod tests {
        );
    }

+    /// An endless `while (true)` script must yield no blobs and return within two seconds.
+    #[test]
+    fn js_eval_interrupts_infinite_loops() {
+        let html = r#"
+            <html>
+              <head>
+                <script>
+                  while (true) {}
+                </script>
+              </head>
+              <body>hello</body>
+            </html>
+        "#;
+
+        let started = Instant::now();
+        let blobs = extract_js_data(html);
+        let took = started.elapsed();
+
+        assert!(blobs.is_empty());
+        let limit = Duration::from_secs(2);
+        assert!(took < limit, "QuickJS execution should be interrupted quickly");
+    }
+
    #[test]
    fn skips_external_and_module_scripts() {
        let html = r#"<html><body>
--- a/crates/webclaw-fetch/src/tls.rs
+++ b/crates/webclaw-fetch/src/tls.rs
@ -5,9 +5,7 @@
 //! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order,
 //! stream dependency, priorities) to match real browser fingerprints.

-use std::time::Duration;
-
-use std::borrow::Cow;
+use std::{borrow::Cow, io, time::Duration};

 use wreq::http2::{
    Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
@ -21,6 +19,41 @@ use wreq::{Client, Emulation};
 use crate::browser::BrowserVariant;
 use crate::error::FetchError;

+/// Resolver that fails closed when a hostname maps to any blocked address.
+#[derive(Clone, Default)]
+struct PublicDnsResolver;
+
+impl wreq::dns::Resolve for PublicDnsResolver {
+    /// Looks up `name` and yields its addresses only if none of them is blocked.
+    fn resolve(&self, name: wreq::dns::Name) -> wreq::dns::Resolving {
+        type BoxError = Box<dyn std::error::Error + Send + Sync>;
+
+        Box::pin(async move {
+            let resolved: Vec<_> = tokio::net::lookup_host((name.as_str(), 0))
+                .await
+                .map_err(|e| Box::new(e) as BoxError)?
+                .collect();
+
+            // One blocked record rejects the whole lookup, not just that record.
+            let any_blocked = resolved
+                .iter()
+                .any(|addr| crate::url_security::is_blocked_ip(addr.ip()));
+            let (kind, reason) = if any_blocked {
+                (
+                    io::ErrorKind::PermissionDenied,
+                    "DNS resolved to a blocked private or internal address",
+                )
+            } else if resolved.is_empty() {
+                (io::ErrorKind::NotFound, "host did not resolve to any addresses")
+            } else {
+                return Ok(Box::new(resolved.into_iter()) as wreq::dns::Addrs);
+            };
+
+            Err(Box::new(io::Error::new(kind, reason)) as BoxError)
+        })
+    }
+}
+
 /// Chrome cipher list (TLS 1.3 + TLS 1.2 in Chrome's exact order).
 const CHROME_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";

@ -503,6 +536,8 @@ pub fn build_client(
        let proxy =
            wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
        builder = builder.proxy(proxy);
+    } else {
+        builder = builder.dns_resolver(PublicDnsResolver::default());
    }

    builder
--- a/crates/webclaw-fetch/src/url_security.rs
+++ b/crates/webclaw-fetch/src/url_security.rs
@ -163,7 +163,9 @@ mod tests {
            Ipv4Addr::new(169, 254, 169, 254),
            Ipv4Addr::new(172, 16, 0, 1),
            Ipv4Addr::new(192, 168, 0, 1),
+            Ipv4Addr::new(192, 0, 0, 8),
            Ipv4Addr::new(198, 18, 0, 1),
+            Ipv4Addr::new(255, 255, 255, 255),
        ] {
            let url = format!("http://{ip}/");
            assert!(validate_public_http_url(&url).await.is_err(), "{ip}");
@ -193,4 +195,9 @@ mod tests {
        );
        assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false);
    }
+
+    #[tokio::test]
+    async fn blocks_localhost_domains_after_resolution() {
+        // Covers a hostname URL; the IP-literal cases are exercised by a separate test.
+        let verdict = validate_public_http_url("http://localhost/").await;
+        assert!(verdict.is_err());
+    }
 }
--- a/crates/webclaw-mcp/src/server.rs
+++ b/crates/webclaw-mcp/src/server.rs
@ -51,9 +51,10 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
    }
 }

-/// Validate that a URL is non-empty and has an http or https scheme.
-fn validate_url(url: &str) -> Result<(), String> {
-    webclaw_fetch::url_security::validate_http_url(url)
+/// Validate that a URL is public HTTP(S), matching the fetch-layer SSRF guard.
+async fn validate_url(url: &str) -> Result<(), String> {
+    webclaw_fetch::url_security::validate_public_http_url(url)
+        .await
        .map(|_| ())
        .map_err(|e| format!("Invalid URL: {e}"))
 }
@ -161,7 +162,7 @@ impl WebclawMcp {
    /// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected.
    #[tool]
    async fn scrape(&self, Parameters(params): Parameters<ScrapeParams>) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;
        let format = params.format.as_deref().unwrap_or("markdown");
        let browser = parse_browser(params.browser.as_deref());
        let include = params.include_selectors.unwrap_or_default();
@ -251,7 +252,7 @@ impl WebclawMcp {
    /// Crawl a website starting from a seed URL, following links breadth-first up to a configurable depth and page limit.
    #[tool]
    async fn crawl(&self, Parameters(params): Parameters<CrawlParams>) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;

        if let Some(max) = params.max_pages
            && max > 500
@ -300,7 +301,7 @@ impl WebclawMcp {
    /// Discover URLs from a website's sitemaps (robots.txt + sitemap.xml).
    #[tool]
    async fn map(&self, Parameters(params): Parameters<MapParams>) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;
        let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, &params.url)
            .await
            .map_err(|e| format!("Sitemap discovery failed: {e}"))?;
@ -323,7 +324,7 @@ impl WebclawMcp {
            return Err("batch is limited to 100 URLs per request".into());
        }
        for u in &params.urls {
-            validate_url(u)?;
+            validate_url(u).await?;
        }

        let format = params.format.as_deref().unwrap_or("markdown");
@ -365,7 +366,7 @@ impl WebclawMcp {
        &self,
        Parameters(params): Parameters<ExtractParams>,
    ) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;

        if params.schema.is_none() && params.prompt.is_none() {
            return Err("Either 'schema' or 'prompt' is required for extraction.".into());
@ -422,7 +423,7 @@ impl WebclawMcp {
        &self,
        Parameters(params): Parameters<SummarizeParams>,
    ) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;

        // No local LLM — fall back to cloud API directly
        if self.llm_chain.is_none() {
@ -464,7 +465,7 @@ impl WebclawMcp {
    /// Automatically falls back to the webclaw cloud API when bot protection is detected.
    #[tool]
    async fn diff(&self, Parameters(params): Parameters<DiffParams>) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;
        let previous: webclaw_core::ExtractionResult =
            serde_json::from_str(&params.previous_snapshot)
                .map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
@ -532,7 +533,7 @@ impl WebclawMcp {
    /// Automatically falls back to the webclaw cloud API when bot protection is detected.
    #[tool]
    async fn brand(&self, Parameters(params): Parameters<BrandParams>) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;
        let fetch_result =
            tokio::time::timeout(LOCAL_FETCH_TIMEOUT, self.fetch_client.fetch(&params.url))
                .await
@ -737,7 +738,7 @@ impl WebclawMcp {
        &self,
        Parameters(params): Parameters<VerticalParams>,
    ) -> Result<String, String> {
-        validate_url(&params.url)?;
+        validate_url(&params.url).await?;
        // Use the cached Firefox client, not the default Chrome one.
        // Reddit's `.json` endpoint rejects the wreq-Chrome TLS
        // fingerprint with a 403 even from residential IPs (they