fix(security): harden local fetch surfaces

This commit is contained in:
Valerio 2026-05-12 12:00:25 +02:00
parent af96628dc9
commit a611ae26f3
5 changed files with 94 additions and 15 deletions

View file

@ -849,11 +849,18 @@ async fn enrich_html_with_stylesheets(html: &str, base_url: &str) -> String {
let client = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(5))
.redirect(reqwest::redirect::Policy::none())
.build()
.unwrap_or_default();
let mut extra_css = String::new();
for href in &hrefs {
if webclaw_fetch::url_security::validate_public_http_url(href)
.await
.is_err()
{
continue;
}
if let Ok(resp) = client.get(href).send().await
&& resp.status().is_success()
&& let Ok(body) = resp.text().await

View file

@ -9,10 +9,12 @@ use once_cell::sync::Lazy;
use regex::Regex;
use rquickjs::{Context, Runtime};
use scraper::{Html, Selector};
use std::time::{Duration, Instant};
use tracing::debug;
/// CSS selector for `script` elements, built lazily on first access.
/// The `unwrap` only fires if the literal selector string fails to parse.
static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").unwrap());
/// Regex matching anything shaped like a markup tag: `<`, one or more
/// non-`>` characters, then `>`. Built lazily on first access.
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
/// Time budget (250 ms) added to `Instant::now()` in `extract_js_data` to form
/// the deadline checked by the QuickJS interrupt handler.
const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);
/// A blob of data extracted from JS execution.
pub struct JsDataBlob {
@ -49,6 +51,8 @@ pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
let rt = Runtime::new().expect("QuickJS runtime creation failed");
rt.set_memory_limit(64 * 1024 * 1024); // 64 MB
rt.set_max_stack_size(1024 * 1024); // 1 MB
let deadline = Instant::now() + JS_EVAL_TIMEOUT;
rt.set_interrupt_handler(Some(Box::new(move || Instant::now() >= deadline)));
let ctx = Context::full(&rt).expect("QuickJS context creation failed");
@ -464,6 +468,8 @@ fn walk_rsc_tree(value: &serde_json::Value, out: &mut Vec<String>, depth: usize)
#[cfg(test)]
mod tests {
use std::time::{Duration, Instant};
use super::*;
#[test]
@ -493,6 +499,29 @@ mod tests {
);
}
/// A page whose inline script spins forever must not hang extraction:
/// `extract_js_data` is expected to return no blobs, and to do so well
/// inside a two-second ceiling.
#[test]
fn js_eval_interrupts_infinite_loops() {
    // Inline script that never terminates on its own.
    let page = r#"
<html>
<head>
<script>
while (true) {}
</script>
</head>
<body>hello</body>
</html>
"#;

    let timer = Instant::now();
    let extracted = extract_js_data(page);

    // Nothing can be harvested from a script that was cut short.
    assert!(extracted.is_empty());

    // The ceiling is generous so the test stays stable on slow machines.
    let ceiling = Duration::from_secs(2);
    assert!(
        timer.elapsed() < ceiling,
        "QuickJS execution should be interrupted quickly"
    );
}
#[test]
fn skips_external_and_module_scripts() {
let html = r#"<html><body>

View file

@ -5,9 +5,7 @@
//! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order,
//! stream dependency, priorities) to match real browser fingerprints.
use std::time::Duration;
use std::borrow::Cow;
use std::{borrow::Cow, io, time::Duration};
use wreq::http2::{
Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
@ -21,6 +19,41 @@ use wreq::{Client, Emulation};
use crate::browser::BrowserVariant;
use crate::error::FetchError;
/// DNS resolver that only hands back addresses accepted by
/// `crate::url_security::is_blocked_ip`.
///
/// The policy is fail-closed: if *any* address returned for a name is
/// blocked, the whole lookup is rejected rather than filtered, and a name
/// that yields no addresses at all is reported as an error too.
#[derive(Clone, Default)]
struct PublicDnsResolver;

impl wreq::dns::Resolve for PublicDnsResolver {
    /// Resolves `name` through `tokio::net::lookup_host` and vets every result.
    ///
    /// # Errors
    ///
    /// - the underlying lookup error, boxed unchanged;
    /// - `io::ErrorKind::PermissionDenied` when at least one resolved address
    ///   is blocked;
    /// - `io::ErrorKind::NotFound` when the lookup produced no addresses.
    fn resolve(&self, name: wreq::dns::Name) -> wreq::dns::Resolving {
        type BoxError = Box<dyn std::error::Error + Send + Sync>;

        Box::pin(async move {
            // `lookup_host` wants a host/port pair; port 0 is only a placeholder.
            // NOTE(review): presumably the connector supplies the real port — confirm.
            let resolved = match tokio::net::lookup_host((name.as_str(), 0)).await {
                Ok(found) => found.collect::<Vec<_>>(),
                Err(lookup_err) => return Err(Box::new(lookup_err) as BoxError),
            };

            // Decide on a single rejection reason, if any. The blocked check
            // comes first; an empty list can never contain a blocked address,
            // so the two cases are mutually exclusive.
            let has_blocked = resolved
                .iter()
                .any(|addr| crate::url_security::is_blocked_ip(addr.ip()));
            let rejection = if has_blocked {
                Some((
                    io::ErrorKind::PermissionDenied,
                    "DNS resolved to a blocked private or internal address",
                ))
            } else if resolved.is_empty() {
                Some((
                    io::ErrorKind::NotFound,
                    "host did not resolve to any addresses",
                ))
            } else {
                None
            };

            match rejection {
                Some((kind, message)) => Err(Box::new(io::Error::new(kind, message)) as BoxError),
                None => Ok(Box::new(resolved.into_iter()) as wreq::dns::Addrs),
            }
        })
    }
}
/// Chrome cipher list (TLS 1.3 + TLS 1.2 in Chrome's exact order).
const CHROME_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";
@ -503,6 +536,8 @@ pub fn build_client(
let proxy =
wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
builder = builder.proxy(proxy);
} else {
builder = builder.dns_resolver(PublicDnsResolver::default());
}
builder

View file

@ -163,7 +163,9 @@ mod tests {
Ipv4Addr::new(169, 254, 169, 254),
Ipv4Addr::new(172, 16, 0, 1),
Ipv4Addr::new(192, 168, 0, 1),
Ipv4Addr::new(192, 0, 0, 8),
Ipv4Addr::new(198, 18, 0, 1),
Ipv4Addr::new(255, 255, 255, 255),
] {
let url = format!("http://{ip}/");
assert!(validate_public_http_url(&url).await.is_err(), "{ip}");
@ -193,4 +195,9 @@ mod tests {
);
assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false);
}
/// A hostname (not an IP literal) naming the local machine must be rejected
/// by the public-URL validator.
#[tokio::test]
async fn blocks_localhost_domains_after_resolution() {
    let verdict = validate_public_http_url("http://localhost/").await;
    assert!(verdict.is_err());
}
}

View file

@ -51,9 +51,10 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
}
}
/// Validate that a URL is non-empty and has an http or https scheme.
fn validate_url(url: &str) -> Result<(), String> {
webclaw_fetch::url_security::validate_http_url(url)
/// Check `url` with the fetch layer's public-HTTP(S) validator before any
/// tool acts on it.
///
/// The validator's success value is discarded. On failure the underlying
/// error is rendered into a string prefixed with `Invalid URL: `.
async fn validate_url(url: &str) -> Result<(), String> {
    match webclaw_fetch::url_security::validate_public_http_url(url).await {
        Ok(_) => Ok(()),
        Err(e) => Err(format!("Invalid URL: {e}")),
    }
}
@ -161,7 +162,7 @@ impl WebclawMcp {
/// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected.
#[tool]
async fn scrape(&self, Parameters(params): Parameters<ScrapeParams>) -> Result<String, String> {
validate_url(&params.url)?;
validate_url(&params.url).await?;
let format = params.format.as_deref().unwrap_or("markdown");
let browser = parse_browser(params.browser.as_deref());
let include = params.include_selectors.unwrap_or_default();
@ -251,7 +252,7 @@ impl WebclawMcp {
/// Crawl a website starting from a seed URL, following links breadth-first up to a configurable depth and page limit.
#[tool]
async fn crawl(&self, Parameters(params): Parameters<CrawlParams>) -> Result<String, String> {
validate_url(&params.url)?;
validate_url(&params.url).await?;
if let Some(max) = params.max_pages
&& max > 500
@ -300,7 +301,7 @@ impl WebclawMcp {
/// Discover URLs from a website's sitemaps (robots.txt + sitemap.xml).
#[tool]
async fn map(&self, Parameters(params): Parameters<MapParams>) -> Result<String, String> {
validate_url(&params.url)?;
validate_url(&params.url).await?;
let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, &params.url)
.await
.map_err(|e| format!("Sitemap discovery failed: {e}"))?;
@ -323,7 +324,7 @@ impl WebclawMcp {
return Err("batch is limited to 100 URLs per request".into());
}
for u in &params.urls {
validate_url(u)?;
validate_url(u).await?;
}
let format = params.format.as_deref().unwrap_or("markdown");
@ -365,7 +366,7 @@ impl WebclawMcp {
&self,
Parameters(params): Parameters<ExtractParams>,
) -> Result<String, String> {
validate_url(&params.url)?;
validate_url(&params.url).await?;
if params.schema.is_none() && params.prompt.is_none() {
return Err("Either 'schema' or 'prompt' is required for extraction.".into());
@ -422,7 +423,7 @@ impl WebclawMcp {
&self,
Parameters(params): Parameters<SummarizeParams>,
) -> Result<String, String> {
validate_url(&params.url)?;
validate_url(&params.url).await?;
// No local LLM — fall back to cloud API directly
if self.llm_chain.is_none() {
@ -464,7 +465,7 @@ impl WebclawMcp {
/// Automatically falls back to the webclaw cloud API when bot protection is detected.
#[tool]
async fn diff(&self, Parameters(params): Parameters<DiffParams>) -> Result<String, String> {
validate_url(&params.url)?;
validate_url(&params.url).await?;
let previous: webclaw_core::ExtractionResult =
serde_json::from_str(&params.previous_snapshot)
.map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
@ -532,7 +533,7 @@ impl WebclawMcp {
/// Automatically falls back to the webclaw cloud API when bot protection is detected.
#[tool]
async fn brand(&self, Parameters(params): Parameters<BrandParams>) -> Result<String, String> {
validate_url(&params.url)?;
validate_url(&params.url).await?;
let fetch_result =
tokio::time::timeout(LOCAL_FETCH_TIMEOUT, self.fetch_client.fetch(&params.url))
.await
@ -737,7 +738,7 @@ impl WebclawMcp {
&self,
Parameters(params): Parameters<VerticalParams>,
) -> Result<String, String> {
validate_url(&params.url)?;
validate_url(&params.url).await?;
// Use the cached Firefox client, not the default Chrome one.
// Reddit's `.json` endpoint rejects the wreq-Chrome TLS
// fingerprint with a 403 even from residential IPs (they