diff --git a/Dockerfile b/Dockerfile index 6f84e06..552aea7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -73,11 +73,9 @@ COPY --from=builder /build/target/release/webclaw-server /usr/local/bin/webclaw- # as documentation; callers still need `-p 3000:3000` on `docker run`. EXPOSE 3000 -# Container default: bind all interfaces so `-p 3000:3000` works. The binary -# itself defaults to 127.0.0.1 (safe for `cargo run` on a laptop); inside -# Docker that would make the server unreachable, so we flip it here. -# Override with -e WEBCLAW_HOST=127.0.0.1 if you front this with another -# process in the same container. +# Container default: bind all interfaces so `-p 3000:3000` works. Public +# binding requires WEBCLAW_API_KEY; the binary refuses open-auth 0.0.0.0 +# unless WEBCLAW_ALLOW_OPEN_PUBLIC=1 is set explicitly for local testing. ENV WEBCLAW_HOST=0.0.0.0 # Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index 94d698f..4fff454 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -199,6 +199,8 @@ impl FetchClient { config.timeout, &config.headers, config.proxy.as_deref(), + config.follow_redirects, + config.max_redirects, ) }) .collect::, _>>()?; @@ -218,7 +220,14 @@ impl FetchClient { .iter() .map(|proxy| { let v = *variants.choose(&mut rng).unwrap(); - crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy)) + crate::tls::build_client( + v, + config.timeout, + &config.headers, + Some(proxy), + config.follow_redirects, + config.max_redirects, + ) }) .collect::, _>>()?; @@ -379,6 +388,8 @@ impl FetchClient { url: &str, extra: &[(&str, &str)], ) -> Result { + let parsed_url = crate::url_security::validate_public_http_url(url).await?; + let url = parsed_url.as_str(); let start = Instant::now(); let client = self.pick_client(url); @@ -463,13 +474,17 @@ impl FetchClient { url: &str, options: &webclaw_core::ExtractionOptions, ) -> Result { + let parsed_url = crate::url_security::validate_public_http_url(url).await?; + let url = parsed_url.as_str(); + // Reddit fallback: use their JSON API to get post + full comment tree. if crate::reddit::is_reddit_url(url) { let json_url = crate::reddit::json_url(url); + let json_url = crate::url_security::validate_public_http_url(&json_url).await?; debug!("reddit detected, fetching {json_url}"); let client = self.pick_client(url); - let resp = client.get(&json_url).send().await?; + let resp = client.get(json_url.as_str()).send().await?; let response = Response::from_wreq(resp).await?; if response.is_success() { let bytes = response.body(); @@ -491,7 +506,7 @@ impl FetchClient { && let Some(homepage) = extract_homepage(url) { debug!("challenge detected, warming cookies via {homepage}"); - let _ = client.get(&homepage).send().await; + let _ = self.fetch(&homepage).await; let resp = client.get(url).send().await?; response = Response::from_wreq(resp).await?; debug!("retried after cookie warmup: status={}", response.status()); diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs index ca04bdb..029a7b6 100644 --- a/crates/webclaw-fetch/src/lib.rs +++ b/crates/webclaw-fetch/src/lib.rs @@ -15,6 +15,7 @@ pub mod proxy; pub mod reddit; pub mod sitemap; pub mod tls; +pub mod url_security; pub use browser::BrowserProfile; pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult}; diff --git a/crates/webclaw-fetch/src/tls.rs b/crates/webclaw-fetch/src/tls.rs index 308265b..fdaeb0b 100644 --- a/crates/webclaw-fetch/src/tls.rs +++ b/crates/webclaw-fetch/src/tls.rs @@ -455,6 +455,8 @@ pub fn build_client( timeout: Duration, extra_headers: &std::collections::HashMap, proxy: Option<&str>, + follow_redirects: bool, + max_redirects: u32, ) -> Result { // SafariIos26 builds its Emulation on top of wreq-util's base instead // of from scratch. See `safari_ios_emulation` for why. @@ -490,7 +492,10 @@ pub fn build_client( let mut builder = Client::builder() .emulation(emulation) - .redirect(wreq::redirect::Policy::limited(10)) + .redirect(ssrf_safe_redirect_policy( + follow_redirects, + max_redirects as usize, + )) .cookie_store(true) .timeout(timeout); @@ -504,3 +509,26 @@ pub fn build_client( .build() .map_err(|e| FetchError::Build(e.to_string())) } + +fn ssrf_safe_redirect_policy( + follow_redirects: bool, + max_redirects: usize, +) -> wreq::redirect::Policy { + if !follow_redirects { + return wreq::redirect::Policy::none(); + } + + wreq::redirect::Policy::custom(move |attempt| { + if attempt.previous.len() > max_redirects { + return attempt.error("too many redirects"); + } + + attempt.pending(|attempt| async move { + let next_url = attempt.uri.to_string(); + match crate::url_security::validate_public_http_url(&next_url).await { + Ok(_) => attempt.follow(), + Err(e) => attempt.error(e.to_string()), + } + }) + }) +} diff --git a/crates/webclaw-fetch/src/url_security.rs b/crates/webclaw-fetch/src/url_security.rs new file mode 100644 index 0000000..1d2b534 --- /dev/null +++ b/crates/webclaw-fetch/src/url_security.rs @@ -0,0 +1,196 @@ +//! SSRF guard for every server-side fetch. +//! +//! Callers may still do cheap parse validation at the edge, but this +//! module is the fetch-layer authority because redirects and helper +//! fetches also pass through it. + +use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; + +use tokio::net::lookup_host; +use url::{Host, Url}; + +use crate::error::FetchError; + +/// Parse a caller-provided URL and require an HTTP(S) host. +pub fn validate_http_url(raw: &str) -> Result { + let trimmed = raw.trim(); + if trimmed.is_empty() { + return Err(FetchError::InvalidUrl("URL must not be empty".into())); + } + + let parsed = + Url::parse(trimmed).map_err(|e| FetchError::InvalidUrl(format!("invalid URL: {e}")))?; + match parsed.scheme() { + "http" | "https" => {} + scheme => { + return Err(FetchError::InvalidUrl(format!( + "scheme '{scheme}' is not allowed, use http:// or https://" + ))); + } + } + + if parsed.host().is_none() { + return Err(FetchError::InvalidUrl("URL must include a host".into())); + } + + Ok(parsed) +} + +/// Parse, resolve, and reject private/internal destinations. +/// +/// A domain is rejected if any resolved address is private or reserved. +/// That is intentionally conservative: mixed public/private DNS answers +/// are unsafe for server-side fetching. +pub async fn validate_public_http_url(raw: &str) -> Result { + let parsed = validate_http_url(raw)?; + validate_url_host_is_public(&parsed).await?; + Ok(parsed) +} + +async fn validate_url_host_is_public(url: &Url) -> Result<(), FetchError> { + match url.host() { + Some(Host::Ipv4(ip)) => reject_blocked_ip(IpAddr::V4(ip)), + Some(Host::Ipv6(ip)) => reject_blocked_ip(IpAddr::V6(ip)), + Some(Host::Domain(host)) => { + let port = url + .port_or_known_default() + .ok_or_else(|| FetchError::InvalidUrl("URL must include a known port".into()))?; + let addrs = lookup_host((host, port)) + .await + .map_err(|e| FetchError::InvalidUrl(format!("failed to resolve host: {e}")))?; + + let mut resolved = false; + for addr in addrs { + resolved = true; + reject_blocked_ip(addr.ip())?; + } + if !resolved { + return Err(FetchError::InvalidUrl( + "host did not resolve to any addresses".into(), + )); + } + Ok(()) + } + None => Err(FetchError::InvalidUrl("URL must include a host".into())), + } +} + +fn reject_blocked_ip(ip: IpAddr) -> Result<(), FetchError> { + if is_blocked_ip(ip) { + Err(FetchError::InvalidUrl( + "URL resolves to a blocked private or internal address".into(), + )) + } else { + Ok(()) + } +} + +/// Return true for IP ranges that should never be fetched server-side. +pub fn is_blocked_ip(ip: IpAddr) -> bool { + match ip { + IpAddr::V4(ip) => is_blocked_ipv4(ip), + IpAddr::V6(ip) => is_blocked_ipv6(ip), + } +} + +fn is_blocked_ipv4(ip: Ipv4Addr) -> bool { + let o = ip.octets(); + + ip.is_unspecified() + || ip.is_loopback() + || ip.is_private() + || ip.is_link_local() + || o[0] == 0 + || o[0] >= 224 + || (o[0] == 100 && (64..=127).contains(&o[1])) + || (o[0] == 192 && o[1] == 0 && o[2] == 0) + || (o[0] == 192 && o[1] == 0 && o[2] == 2) + || (o[0] == 198 && (18..=19).contains(&o[1])) + || (o[0] == 198 && o[1] == 51 && o[2] == 100) + || (o[0] == 203 && o[1] == 0 && o[2] == 113) +} + +fn is_blocked_ipv6(ip: Ipv6Addr) -> bool { + let s = ip.segments(); + + ip.is_unspecified() + || ip.is_loopback() + || ip.is_multicast() + || (s[0] & 0xfe00) == 0xfc00 + || (s[0] & 0xffc0) == 0xfe80 + || (s[0] == 0x0064 && s[1] == 0xff9b && s[2] == 0 && s[3] == 0 && s[4] == 0 && s[5] == 0) + || (s[0] == 0x2001 && s[1] == 0x0db8) + || embedded_ipv4(ip).is_some_and(is_blocked_ipv4) +} + +fn embedded_ipv4(ip: Ipv6Addr) -> Option { + let s = ip.segments(); + + if s[0] == 0 && s[1] == 0 && s[2] == 0 && s[3] == 0 && s[4] == 0 && s[5] == 0xffff { + return Some(Ipv4Addr::new( + (s[6] >> 8) as u8, + s[6] as u8, + (s[7] >> 8) as u8, + s[7] as u8, + )); + } + + if s[0] == 0 && s[1] == 0 && s[2] == 0 && s[3] == 0 && s[4] == 0 && s[5] == 0 { + return Some(Ipv4Addr::new( + (s[6] >> 8) as u8, + s[6] as u8, + (s[7] >> 8) as u8, + s[7] as u8, + )); + } + + None +} + +#[cfg(test)] +mod tests { + use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; + + use super::{is_blocked_ip, validate_public_http_url}; + + #[tokio::test] + async fn blocks_ipv4_internal_ranges() { + for ip in [ + Ipv4Addr::new(0, 0, 0, 0), + Ipv4Addr::new(10, 0, 0, 1), + Ipv4Addr::new(100, 64, 0, 1), + Ipv4Addr::new(127, 0, 0, 1), + Ipv4Addr::new(169, 254, 169, 254), + Ipv4Addr::new(172, 16, 0, 1), + Ipv4Addr::new(192, 168, 0, 1), + Ipv4Addr::new(198, 18, 0, 1), + ] { + let url = format!("http://{ip}/"); + assert!(validate_public_http_url(&url).await.is_err(), "{ip}"); + } + } + + #[tokio::test] + async fn blocks_ipv6_internal_ranges() { + for ip in [ + Ipv6Addr::LOCALHOST, + Ipv6Addr::UNSPECIFIED, + "fc00::1".parse().unwrap(), + "fe80::1".parse().unwrap(), + "64:ff9b::7f00:1".parse().unwrap(), + "::ffff:127.0.0.1".parse().unwrap(), + ] { + assert!(is_blocked_ip(IpAddr::V6(ip)), "{ip}"); + } + } + + #[tokio::test] + async fn allows_public_ip_literals() { + assert!( + validate_public_http_url("https://93.184.216.34/") + .await + .is_ok() + ); + assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false); + } +} diff --git a/crates/webclaw-mcp/src/server.rs b/crates/webclaw-mcp/src/server.rs index 45e8647..d56032d 100644 --- a/crates/webclaw-mcp/src/server.rs +++ b/crates/webclaw-mcp/src/server.rs @@ -13,7 +13,6 @@ use rmcp::model::{Implementation, ServerCapabilities, ServerInfo}; use rmcp::{ServerHandler, tool, tool_handler, tool_router}; use serde_json::json; use tracing::{error, info, warn}; -use url::Url; use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult}; @@ -54,19 +53,9 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile { /// Validate that a URL is non-empty and has an http or https scheme. fn validate_url(url: &str) -> Result<(), String> { - if url.is_empty() { - return Err("Invalid URL: must not be empty".into()); - } - match Url::parse(url) { - Ok(parsed) if parsed.scheme() == "http" || parsed.scheme() == "https" => Ok(()), - Ok(parsed) => Err(format!( - "Invalid URL: scheme '{}' not allowed, must start with http:// or https://", - parsed.scheme() - )), - Err(e) => Err(format!( - "Invalid URL: {e}. Must start with http:// or https://" - )), - } + webclaw_fetch::url_security::validate_http_url(url) + .map(|_| ()) + .map_err(|e| format!("Invalid URL: {e}")) } /// Timeout for local fetch calls (prevents hanging on tarpitting servers). diff --git a/crates/webclaw-server/src/error.rs b/crates/webclaw-server/src/error.rs index c49a1c9..7f1d36e 100644 --- a/crates/webclaw-server/src/error.rs +++ b/crates/webclaw-server/src/error.rs @@ -70,7 +70,12 @@ impl IntoResponse for ApiError { impl From for ApiError { fn from(e: webclaw_fetch::FetchError) -> Self { - Self::Fetch(e.to_string()) + match e { + webclaw_fetch::FetchError::InvalidUrl(msg) => { + Self::BadRequest(format!("invalid url: {msg}")) + } + other => Self::Fetch(other.to_string()), + } } } diff --git a/crates/webclaw-server/src/main.rs b/crates/webclaw-server/src/main.rs index f4cfdcb..06f2451 100644 --- a/crates/webclaw-server/src/main.rs +++ b/crates/webclaw-server/src/main.rs @@ -75,6 +75,15 @@ async fn main() -> anyhow::Result<()> { .compact() .init(); + if is_unspecified_addr(args.host) + && args.api_key.is_none() + && std::env::var_os("WEBCLAW_ALLOW_OPEN_PUBLIC").is_none() + { + anyhow::bail!( + "refusing to bind 0.0.0.0/[::] without WEBCLAW_API_KEY; set WEBCLAW_API_KEY or WEBCLAW_ALLOW_OPEN_PUBLIC=1 to override" + ); + } + let state = AppState::new(args.api_key.clone())?; let v1 = Router::new() @@ -121,3 +130,10 @@ async fn main() -> anyhow::Result<()> { axum::serve(listener, app).await?; Ok(()) } + +fn is_unspecified_addr(addr: IpAddr) -> bool { + match addr { + IpAddr::V4(ip) => ip.is_unspecified(), + IpAddr::V6(ip) => ip.is_unspecified(), + } +} diff --git a/crates/webclaw-server/src/routes/batch.rs b/crates/webclaw-server/src/routes/batch.rs index 99533c9..18ac1f4 100644 --- a/crates/webclaw-server/src/routes/batch.rs +++ b/crates/webclaw-server/src/routes/batch.rs @@ -37,6 +37,14 @@ pub async fn batch( req.urls.len() ))); } + let mut safe_urls = Vec::with_capacity(req.urls.len()); + for url in &req.urls { + safe_urls.push( + webclaw_fetch::url_security::validate_public_http_url(url) + .await? + .to_string(), + ); + } let concurrency = req.concurrency.unwrap_or(5).clamp(1, HARD_MAX_CONCURRENCY); @@ -47,7 +55,7 @@ pub async fn batch( include_raw_html: false, }; - let url_refs: Vec<&str> = req.urls.iter().map(|s| s.as_str()).collect(); + let url_refs: Vec<&str> = safe_urls.iter().map(|s| s.as_str()).collect(); let results = state .fetch() .fetch_and_extract_batch_with_options(&url_refs, concurrency, &options) diff --git a/crates/webclaw-server/src/routes/scrape.rs b/crates/webclaw-server/src/routes/scrape.rs index 1c5fc52..2f7e73f 100644 --- a/crates/webclaw-server/src/routes/scrape.rs +++ b/crates/webclaw-server/src/routes/scrape.rs @@ -52,6 +52,7 @@ pub async fn scrape( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; let formats = req.formats.as_vec(); let options = ExtractionOptions { @@ -63,11 +64,11 @@ pub async fn scrape( let extraction = state .fetch() - .fetch_and_extract_with_options(&req.url, &options) + .fetch_and_extract_with_options(url.as_str(), &options) .await?; let mut body = json!({ - "url": extraction.metadata.url.clone().unwrap_or_else(|| req.url.clone()), + "url": extraction.metadata.url.clone().unwrap_or_else(|| url.to_string()), "metadata": extraction.metadata, }); let obj = body.as_object_mut().expect("json::object");