fix: harden fetch URL validation

This commit is contained in:
Valerio 2026-05-04 11:50:57 +02:00
parent 23544f8fac
commit bdf81fe6bf
10 changed files with 284 additions and 27 deletions

View file

@ -73,11 +73,9 @@ COPY --from=builder /build/target/release/webclaw-server /usr/local/bin/webclaw-
# as documentation; callers still need `-p 3000:3000` on `docker run`. # as documentation; callers still need `-p 3000:3000` on `docker run`.
EXPOSE 3000 EXPOSE 3000
# Container default: bind all interfaces so `-p 3000:3000` works. The binary # Container default: bind all interfaces so `-p 3000:3000` works. Public
# itself defaults to 127.0.0.1 (safe for `cargo run` on a laptop); inside # binding requires WEBCLAW_API_KEY; the binary refuses open-auth 0.0.0.0
# Docker that would make the server unreachable, so we flip it here. # unless WEBCLAW_ALLOW_OPEN_PUBLIC=1 is set explicitly for local testing.
# Override with -e WEBCLAW_HOST=127.0.0.1 if you front this with another
# process in the same container.
ENV WEBCLAW_HOST=0.0.0.0 ENV WEBCLAW_HOST=0.0.0.0
# Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other # Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other

View file

@ -199,6 +199,8 @@ impl FetchClient {
config.timeout, config.timeout,
&config.headers, &config.headers,
config.proxy.as_deref(), config.proxy.as_deref(),
config.follow_redirects,
config.max_redirects,
) )
}) })
.collect::<Result<Vec<_>, _>>()?; .collect::<Result<Vec<_>, _>>()?;
@ -218,7 +220,14 @@ impl FetchClient {
.iter() .iter()
.map(|proxy| { .map(|proxy| {
let v = *variants.choose(&mut rng).unwrap(); let v = *variants.choose(&mut rng).unwrap();
crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy)) crate::tls::build_client(
v,
config.timeout,
&config.headers,
Some(proxy),
config.follow_redirects,
config.max_redirects,
)
}) })
.collect::<Result<Vec<_>, _>>()?; .collect::<Result<Vec<_>, _>>()?;
@ -379,6 +388,8 @@ impl FetchClient {
url: &str, url: &str,
extra: &[(&str, &str)], extra: &[(&str, &str)],
) -> Result<FetchResult, FetchError> { ) -> Result<FetchResult, FetchError> {
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
let url = parsed_url.as_str();
let start = Instant::now(); let start = Instant::now();
let client = self.pick_client(url); let client = self.pick_client(url);
@ -463,13 +474,17 @@ impl FetchClient {
url: &str, url: &str,
options: &webclaw_core::ExtractionOptions, options: &webclaw_core::ExtractionOptions,
) -> Result<webclaw_core::ExtractionResult, FetchError> { ) -> Result<webclaw_core::ExtractionResult, FetchError> {
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
let url = parsed_url.as_str();
// Reddit fallback: use their JSON API to get post + full comment tree. // Reddit fallback: use their JSON API to get post + full comment tree.
if crate::reddit::is_reddit_url(url) { if crate::reddit::is_reddit_url(url) {
let json_url = crate::reddit::json_url(url); let json_url = crate::reddit::json_url(url);
let json_url = crate::url_security::validate_public_http_url(&json_url).await?;
debug!("reddit detected, fetching {json_url}"); debug!("reddit detected, fetching {json_url}");
let client = self.pick_client(url); let client = self.pick_client(url);
let resp = client.get(&json_url).send().await?; let resp = client.get(json_url.as_str()).send().await?;
let response = Response::from_wreq(resp).await?; let response = Response::from_wreq(resp).await?;
if response.is_success() { if response.is_success() {
let bytes = response.body(); let bytes = response.body();
@ -491,7 +506,7 @@ impl FetchClient {
&& let Some(homepage) = extract_homepage(url) && let Some(homepage) = extract_homepage(url)
{ {
debug!("challenge detected, warming cookies via {homepage}"); debug!("challenge detected, warming cookies via {homepage}");
let _ = client.get(&homepage).send().await; let _ = self.fetch(&homepage).await;
let resp = client.get(url).send().await?; let resp = client.get(url).send().await?;
response = Response::from_wreq(resp).await?; response = Response::from_wreq(resp).await?;
debug!("retried after cookie warmup: status={}", response.status()); debug!("retried after cookie warmup: status={}", response.status());

View file

@ -15,6 +15,7 @@ pub mod proxy;
pub mod reddit; pub mod reddit;
pub mod sitemap; pub mod sitemap;
pub mod tls; pub mod tls;
pub mod url_security;
pub use browser::BrowserProfile; pub use browser::BrowserProfile;
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult}; pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};

View file

@ -455,6 +455,8 @@ pub fn build_client(
timeout: Duration, timeout: Duration,
extra_headers: &std::collections::HashMap<String, String>, extra_headers: &std::collections::HashMap<String, String>,
proxy: Option<&str>, proxy: Option<&str>,
follow_redirects: bool,
max_redirects: u32,
) -> Result<Client, FetchError> { ) -> Result<Client, FetchError> {
// SafariIos26 builds its Emulation on top of wreq-util's base instead // SafariIos26 builds its Emulation on top of wreq-util's base instead
// of from scratch. See `safari_ios_emulation` for why. // of from scratch. See `safari_ios_emulation` for why.
@ -490,7 +492,10 @@ pub fn build_client(
let mut builder = Client::builder() let mut builder = Client::builder()
.emulation(emulation) .emulation(emulation)
.redirect(wreq::redirect::Policy::limited(10)) .redirect(ssrf_safe_redirect_policy(
follow_redirects,
max_redirects as usize,
))
.cookie_store(true) .cookie_store(true)
.timeout(timeout); .timeout(timeout);
@ -504,3 +509,26 @@ pub fn build_client(
.build() .build()
.map_err(|e| FetchError::Build(e.to_string())) .map_err(|e| FetchError::Build(e.to_string()))
} }
/// Build the redirect policy installed on every fetch client.
///
/// When `follow_redirects` is `false` no redirect is followed at all.
/// Otherwise a hop is refused with "too many redirects" once
/// `attempt.previous` holds more than `max_redirects` entries, and the target
/// of every remaining hop is re-checked with
/// `url_security::validate_public_http_url` before it is followed, so a
/// redirect cannot lead the client to a destination the entry-point
/// validation would have rejected.
fn ssrf_safe_redirect_policy(
    follow_redirects: bool,
    max_redirects: usize,
) -> wreq::redirect::Policy {
    if follow_redirects {
        wreq::redirect::Policy::custom(move |hop| {
            if hop.previous.len() > max_redirects {
                return hop.error("too many redirects");
            }

            // The check resolves DNS, so the decision is deferred to a future.
            hop.pending(|hop| async move {
                let target = hop.uri.to_string();
                let verdict = crate::url_security::validate_public_http_url(&target).await;
                match verdict {
                    Err(reason) => hop.error(reason.to_string()),
                    Ok(_) => hop.follow(),
                }
            })
        })
    } else {
        wreq::redirect::Policy::none()
    }
}

View file

@ -0,0 +1,196 @@
//! SSRF guard for every server-side fetch.
//!
//! Callers may still do cheap parse validation at the edge, but this
//! module is the fetch-layer authority because redirects and helper
//! fetches also pass through it.
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
use tokio::net::lookup_host;
use url::{Host, Url};
use crate::error::FetchError;
/// Parse a caller-supplied URL, accepting only `http`/`https` URLs with a host.
///
/// Leading and trailing whitespace is ignored. No DNS resolution happens here;
/// this is the cheap syntactic half of the check.
///
/// # Errors
///
/// Returns [`FetchError::InvalidUrl`] when the input is empty after trimming,
/// does not parse as a URL, uses any scheme other than `http`/`https`, or has
/// no host component.
pub fn validate_http_url(raw: &str) -> Result<Url, FetchError> {
    let candidate = raw.trim();
    if candidate.is_empty() {
        return Err(FetchError::InvalidUrl("URL must not be empty".into()));
    }

    let url = match Url::parse(candidate) {
        Ok(url) => url,
        Err(e) => return Err(FetchError::InvalidUrl(format!("invalid URL: {e}"))),
    };

    let scheme = url.scheme();
    if scheme != "http" && scheme != "https" {
        return Err(FetchError::InvalidUrl(format!(
            "scheme '{scheme}' is not allowed, use http:// or https://"
        )));
    }

    let has_host = url.host().is_some();
    if has_host {
        Ok(url)
    } else {
        Err(FetchError::InvalidUrl("URL must include a host".into()))
    }
}
/// Validate a URL syntactically, then require its host to be publicly routable.
///
/// The URL first goes through [`validate_http_url`]; its host is then checked
/// against the blocked address ranges, resolving domain names via DNS. A
/// domain is refused as soon as any one of its resolved addresses is blocked:
/// a DNS answer mixing public and private addresses is treated as unsafe for
/// server-side fetching.
///
/// # Errors
///
/// Returns the error produced by either validation step. On success the
/// parsed [`Url`] is handed back to the caller.
pub async fn validate_public_http_url(raw: &str) -> Result<Url, FetchError> {
    let url = validate_http_url(raw)?;
    let host_check = validate_url_host_is_public(&url).await;
    host_check.map(|()| url)
}
/// Check that the host of `url` does not point at a blocked address.
///
/// IP-literal hosts are checked directly. Domain hosts are resolved with
/// `lookup_host` (using the URL's explicit or scheme-default port) and every
/// returned address must pass; resolution failures and empty answers are
/// rejected as well.
///
/// # Errors
///
/// Returns [`FetchError::InvalidUrl`] when the URL has no host, no known
/// port, fails to resolve, resolves to nothing, or resolves to a blocked
/// address.
async fn validate_url_host_is_public(url: &Url) -> Result<(), FetchError> {
    // IP literals need no DNS; only a domain name falls through.
    let domain = match url.host() {
        None => return Err(FetchError::InvalidUrl("URL must include a host".into())),
        Some(Host::Ipv4(v4)) => return reject_blocked_ip(IpAddr::V4(v4)),
        Some(Host::Ipv6(v6)) => return reject_blocked_ip(IpAddr::V6(v6)),
        Some(Host::Domain(name)) => name,
    };

    let Some(port) = url.port_or_known_default() else {
        return Err(FetchError::InvalidUrl("URL must include a known port".into()));
    };

    let answers = match lookup_host((domain, port)).await {
        Ok(answers) => answers,
        Err(e) => {
            return Err(FetchError::InvalidUrl(format!(
                "failed to resolve host: {e}"
            )));
        }
    };

    // Bail out on the first blocked address; count the rest so an empty
    // answer can be told apart from an all-public one.
    let mut checked = 0usize;
    for socket_addr in answers {
        reject_blocked_ip(socket_addr.ip())?;
        checked += 1;
    }

    if checked == 0 {
        Err(FetchError::InvalidUrl(
            "host did not resolve to any addresses".into(),
        ))
    } else {
        Ok(())
    }
}
/// Turn the boolean verdict of [`is_blocked_ip`] into a `Result`.
///
/// # Errors
///
/// Returns [`FetchError::InvalidUrl`] when `ip` falls in a blocked range.
fn reject_blocked_ip(ip: IpAddr) -> Result<(), FetchError> {
    if !is_blocked_ip(ip) {
        return Ok(());
    }

    Err(FetchError::InvalidUrl(
        "URL resolves to a blocked private or internal address".into(),
    ))
}
/// Report whether `ip` lies in a range that must never be fetched server-side.
///
/// Dispatches to the IPv4 or IPv6 rule set according to the address family.
pub fn is_blocked_ip(ip: IpAddr) -> bool {
    match ip {
        IpAddr::V6(v6) => is_blocked_ipv6(v6),
        IpAddr::V4(v4) => is_blocked_ipv4(v4),
    }
}
/// Report whether an IPv4 address belongs to a blocked range.
///
/// The whole rule set is expressed as one pattern over the four octets; each
/// alternative is annotated with the CIDR block it covers.
fn is_blocked_ipv4(ip: Ipv4Addr) -> bool {
    matches!(
        ip.octets(),
        // 0.0.0.0/8, which includes the unspecified address.
        [0, ..]
            // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 (private use).
            | [10, ..]
            | [172, 16..=31, ..]
            | [192, 168, ..]
            // 127.0.0.0/8 (loopback).
            | [127, ..]
            // 169.254.0.0/16 (link-local).
            | [169, 254, ..]
            // 100.64.0.0/10.
            | [100, 64..=127, ..]
            // 192.0.0.0/24 and 192.0.2.0/24.
            | [192, 0, 0 | 2, _]
            // 198.18.0.0/15.
            | [198, 18..=19, ..]
            // 198.51.100.0/24 and 203.0.113.0/24.
            | [198, 51, 100, _]
            | [203, 0, 113, _]
            // 224.0.0.0/3: everything from the multicast block upwards.
            | [224..=255, ..]
    )
}
/// Report whether an IPv6 address belongs to a blocked range.
///
/// Blocks the unspecified, loopback and multicast addresses, the prefixes
/// `fc00::/7`, `fe80::/10`, `64:ff9b::/96` and `2001:db8::/32`, and any
/// address from which `embedded_ipv4` extracts an IPv4 address that is itself
/// blocked.
///
/// NOTE(review): other prefixes that can carry an IPv4 address, such as 6to4
/// (`2002::/16`) and Teredo (`2001::/32`), are not inspected here.
fn is_blocked_ipv6(ip: Ipv6Addr) -> bool {
    if ip.is_unspecified() || ip.is_loopback() || ip.is_multicast() {
        return true;
    }

    let seg = ip.segments();
    let in_fc00_7 = (seg[0] & 0xfe00) == 0xfc00;
    let in_fe80_10 = (seg[0] & 0xffc0) == 0xfe80;
    // The whole 64:ff9b::/96 prefix is refused, whatever the low 32 bits are.
    let in_64_ff9b_96 = seg[..6] == [0x0064, 0xff9b, 0, 0, 0, 0];
    let in_2001_db8_32 = seg[..2] == [0x2001, 0x0db8];
    if in_fc00_7 || in_fe80_10 || in_64_ff9b_96 || in_2001_db8_32 {
        return true;
    }

    embedded_ipv4(ip).is_some_and(is_blocked_ipv4)
}
/// Extract the IPv4 address carried in the low 32 bits of `ip`, if any.
///
/// Two layouts are recognised: the first five 16-bit segments are zero and
/// the sixth is either `0xffff` (`::ffff:a.b.c.d`) or `0` (`::a.b.c.d`).
/// Every other address yields `None`.
fn embedded_ipv4(ip: Ipv6Addr) -> Option<Ipv4Addr> {
    match ip.segments() {
        [0, 0, 0, 0, 0, 0 | 0xffff, high, low] => {
            // Each segment holds two octets, most significant first.
            let [a, b] = high.to_be_bytes();
            let [c, d] = low.to_be_bytes();
            Some(Ipv4Addr::new(a, b, c, d))
        }
        _ => None,
    }
}
#[cfg(test)]
mod tests {
    use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};

    use super::{is_blocked_ip, validate_public_http_url};

    /// IPv4 literals from each internal range must fail full URL validation.
    #[tokio::test]
    async fn blocks_ipv4_internal_ranges() {
        let internal = [
            Ipv4Addr::new(0, 0, 0, 0),
            Ipv4Addr::new(10, 0, 0, 1),
            Ipv4Addr::new(100, 64, 0, 1),
            Ipv4Addr::new(127, 0, 0, 1),
            Ipv4Addr::new(169, 254, 169, 254),
            Ipv4Addr::new(172, 16, 0, 1),
            Ipv4Addr::new(192, 168, 0, 1),
            Ipv4Addr::new(198, 18, 0, 1),
        ];
        for ip in internal {
            let outcome = validate_public_http_url(&format!("http://{ip}/")).await;
            assert!(outcome.is_err(), "{ip}");
        }
    }

    /// Internal IPv6 addresses, including ones that wrap an internal IPv4
    /// address, must be reported as blocked.
    #[tokio::test]
    async fn blocks_ipv6_internal_ranges() {
        let internal: [Ipv6Addr; 6] = [
            Ipv6Addr::LOCALHOST,
            Ipv6Addr::UNSPECIFIED,
            "fc00::1".parse().unwrap(),
            "fe80::1".parse().unwrap(),
            "64:ff9b::7f00:1".parse().unwrap(),
            "::ffff:127.0.0.1".parse().unwrap(),
        ];
        for ip in internal {
            assert!(is_blocked_ip(IpAddr::from(ip)), "{ip}");
        }
    }

    /// Public IP literals pass both the URL-level and the address-level check.
    #[tokio::test]
    async fn allows_public_ip_literals() {
        let literal = validate_public_http_url("https://93.184.216.34/").await;
        assert!(literal.is_ok());

        let public_dns = IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8));
        assert!(!is_blocked_ip(public_dns));
    }
}

View file

@ -13,7 +13,6 @@ use rmcp::model::{Implementation, ServerCapabilities, ServerInfo};
use rmcp::{ServerHandler, tool, tool_handler, tool_router}; use rmcp::{ServerHandler, tool, tool_handler, tool_router};
use serde_json::json; use serde_json::json;
use tracing::{error, info, warn}; use tracing::{error, info, warn};
use url::Url;
use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult}; use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult};
@ -54,19 +53,9 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
/// Validate that a URL is non-empty and has an http or https scheme. /// Validate that a URL is non-empty and has an http or https scheme.
fn validate_url(url: &str) -> Result<(), String> { fn validate_url(url: &str) -> Result<(), String> {
if url.is_empty() { webclaw_fetch::url_security::validate_http_url(url)
return Err("Invalid URL: must not be empty".into()); .map(|_| ())
} .map_err(|e| format!("Invalid URL: {e}"))
match Url::parse(url) {
Ok(parsed) if parsed.scheme() == "http" || parsed.scheme() == "https" => Ok(()),
Ok(parsed) => Err(format!(
"Invalid URL: scheme '{}' not allowed, must start with http:// or https://",
parsed.scheme()
)),
Err(e) => Err(format!(
"Invalid URL: {e}. Must start with http:// or https://"
)),
}
} }
/// Timeout for local fetch calls (prevents hanging on tarpitting servers). /// Timeout for local fetch calls (prevents hanging on tarpitting servers).

View file

@ -70,7 +70,12 @@ impl IntoResponse for ApiError {
impl From<webclaw_fetch::FetchError> for ApiError { impl From<webclaw_fetch::FetchError> for ApiError {
fn from(e: webclaw_fetch::FetchError) -> Self { fn from(e: webclaw_fetch::FetchError) -> Self {
Self::Fetch(e.to_string()) match e {
webclaw_fetch::FetchError::InvalidUrl(msg) => {
Self::BadRequest(format!("invalid url: {msg}"))
}
other => Self::Fetch(other.to_string()),
}
} }
} }

View file

@ -75,6 +75,15 @@ async fn main() -> anyhow::Result<()> {
.compact() .compact()
.init(); .init();
if is_unspecified_addr(args.host)
&& args.api_key.is_none()
&& std::env::var_os("WEBCLAW_ALLOW_OPEN_PUBLIC").is_none()
{
anyhow::bail!(
"refusing to bind 0.0.0.0/[::] without WEBCLAW_API_KEY; set WEBCLAW_API_KEY or WEBCLAW_ALLOW_OPEN_PUBLIC=1 to override"
);
}
let state = AppState::new(args.api_key.clone())?; let state = AppState::new(args.api_key.clone())?;
let v1 = Router::new() let v1 = Router::new()
@ -121,3 +130,10 @@ async fn main() -> anyhow::Result<()> {
axum::serve(listener, app).await?; axum::serve(listener, app).await?;
Ok(()) Ok(())
} }
/// Report whether `addr` is the wildcard bind address of its family
/// (`0.0.0.0` or `::`).
fn is_unspecified_addr(addr: IpAddr) -> bool {
    // `IpAddr` already forwards this query to the wrapped V4/V6 address.
    addr.is_unspecified()
}

View file

@ -37,6 +37,14 @@ pub async fn batch(
req.urls.len() req.urls.len()
))); )));
} }
let mut safe_urls = Vec::with_capacity(req.urls.len());
for url in &req.urls {
safe_urls.push(
webclaw_fetch::url_security::validate_public_http_url(url)
.await?
.to_string(),
);
}
let concurrency = req.concurrency.unwrap_or(5).clamp(1, HARD_MAX_CONCURRENCY); let concurrency = req.concurrency.unwrap_or(5).clamp(1, HARD_MAX_CONCURRENCY);
@ -47,7 +55,7 @@ pub async fn batch(
include_raw_html: false, include_raw_html: false,
}; };
let url_refs: Vec<&str> = req.urls.iter().map(|s| s.as_str()).collect(); let url_refs: Vec<&str> = safe_urls.iter().map(|s| s.as_str()).collect();
let results = state let results = state
.fetch() .fetch()
.fetch_and_extract_batch_with_options(&url_refs, concurrency, &options) .fetch_and_extract_batch_with_options(&url_refs, concurrency, &options)

View file

@ -52,6 +52,7 @@ pub async fn scrape(
if req.url.trim().is_empty() { if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required")); return Err(ApiError::bad_request("`url` is required"));
} }
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
let formats = req.formats.as_vec(); let formats = req.formats.as_vec();
let options = ExtractionOptions { let options = ExtractionOptions {
@ -63,11 +64,11 @@ pub async fn scrape(
let extraction = state let extraction = state
.fetch() .fetch()
.fetch_and_extract_with_options(&req.url, &options) .fetch_and_extract_with_options(url.as_str(), &options)
.await?; .await?;
let mut body = json!({ let mut body = json!({
"url": extraction.metadata.url.clone().unwrap_or_else(|| req.url.clone()), "url": extraction.metadata.url.clone().unwrap_or_else(|| url.to_string()),
"metadata": extraction.metadata, "metadata": extraction.metadata,
}); });
let obj = body.as_object_mut().expect("json::object"); let obj = body.as_object_mut().expect("json::object");