mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
fix: harden fetch URL validation
This commit is contained in:
parent
23544f8fac
commit
bdf81fe6bf
10 changed files with 284 additions and 27 deletions
|
|
@ -199,6 +199,8 @@ impl FetchClient {
|
|||
config.timeout,
|
||||
&config.headers,
|
||||
config.proxy.as_deref(),
|
||||
config.follow_redirects,
|
||||
config.max_redirects,
|
||||
)
|
||||
})
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
|
@ -218,7 +220,14 @@ impl FetchClient {
|
|||
.iter()
|
||||
.map(|proxy| {
|
||||
let v = *variants.choose(&mut rng).unwrap();
|
||||
crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy))
|
||||
crate::tls::build_client(
|
||||
v,
|
||||
config.timeout,
|
||||
&config.headers,
|
||||
Some(proxy),
|
||||
config.follow_redirects,
|
||||
config.max_redirects,
|
||||
)
|
||||
})
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
|
|
@ -379,6 +388,8 @@ impl FetchClient {
|
|||
url: &str,
|
||||
extra: &[(&str, &str)],
|
||||
) -> Result<FetchResult, FetchError> {
|
||||
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
|
||||
let url = parsed_url.as_str();
|
||||
let start = Instant::now();
|
||||
let client = self.pick_client(url);
|
||||
|
||||
|
|
@ -463,13 +474,17 @@ impl FetchClient {
|
|||
url: &str,
|
||||
options: &webclaw_core::ExtractionOptions,
|
||||
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
||||
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
|
||||
let url = parsed_url.as_str();
|
||||
|
||||
// Reddit fallback: use their JSON API to get post + full comment tree.
|
||||
if crate::reddit::is_reddit_url(url) {
|
||||
let json_url = crate::reddit::json_url(url);
|
||||
let json_url = crate::url_security::validate_public_http_url(&json_url).await?;
|
||||
debug!("reddit detected, fetching {json_url}");
|
||||
|
||||
let client = self.pick_client(url);
|
||||
let resp = client.get(&json_url).send().await?;
|
||||
let resp = client.get(json_url.as_str()).send().await?;
|
||||
let response = Response::from_wreq(resp).await?;
|
||||
if response.is_success() {
|
||||
let bytes = response.body();
|
||||
|
|
@ -491,7 +506,7 @@ impl FetchClient {
|
|||
&& let Some(homepage) = extract_homepage(url)
|
||||
{
|
||||
debug!("challenge detected, warming cookies via {homepage}");
|
||||
let _ = client.get(&homepage).send().await;
|
||||
let _ = self.fetch(&homepage).await;
|
||||
let resp = client.get(url).send().await?;
|
||||
response = Response::from_wreq(resp).await?;
|
||||
debug!("retried after cookie warmup: status={}", response.status());
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ pub mod proxy;
|
|||
pub mod reddit;
|
||||
pub mod sitemap;
|
||||
pub mod tls;
|
||||
pub mod url_security;
|
||||
|
||||
pub use browser::BrowserProfile;
|
||||
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
|
||||
|
|
|
|||
|
|
@ -455,6 +455,8 @@ pub fn build_client(
|
|||
timeout: Duration,
|
||||
extra_headers: &std::collections::HashMap<String, String>,
|
||||
proxy: Option<&str>,
|
||||
follow_redirects: bool,
|
||||
max_redirects: u32,
|
||||
) -> Result<Client, FetchError> {
|
||||
// SafariIos26 builds its Emulation on top of wreq-util's base instead
|
||||
// of from scratch. See `safari_ios_emulation` for why.
|
||||
|
|
@ -490,7 +492,10 @@ pub fn build_client(
|
|||
|
||||
let mut builder = Client::builder()
|
||||
.emulation(emulation)
|
||||
.redirect(wreq::redirect::Policy::limited(10))
|
||||
.redirect(ssrf_safe_redirect_policy(
|
||||
follow_redirects,
|
||||
max_redirects as usize,
|
||||
))
|
||||
.cookie_store(true)
|
||||
.timeout(timeout);
|
||||
|
||||
|
|
@ -504,3 +509,26 @@ pub fn build_client(
|
|||
.build()
|
||||
.map_err(|e| FetchError::Build(e.to_string()))
|
||||
}
|
||||
|
||||
fn ssrf_safe_redirect_policy(
|
||||
follow_redirects: bool,
|
||||
max_redirects: usize,
|
||||
) -> wreq::redirect::Policy {
|
||||
if !follow_redirects {
|
||||
return wreq::redirect::Policy::none();
|
||||
}
|
||||
|
||||
wreq::redirect::Policy::custom(move |attempt| {
|
||||
if attempt.previous.len() > max_redirects {
|
||||
return attempt.error("too many redirects");
|
||||
}
|
||||
|
||||
attempt.pending(|attempt| async move {
|
||||
let next_url = attempt.uri.to_string();
|
||||
match crate::url_security::validate_public_http_url(&next_url).await {
|
||||
Ok(_) => attempt.follow(),
|
||||
Err(e) => attempt.error(e.to_string()),
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
|
|
|
|||
196
crates/webclaw-fetch/src/url_security.rs
Normal file
196
crates/webclaw-fetch/src/url_security.rs
Normal file
|
|
@ -0,0 +1,196 @@
|
|||
//! SSRF guard for every server-side fetch.
|
||||
//!
|
||||
//! Callers may still do cheap parse validation at the edge, but this
|
||||
//! module is the fetch-layer authority because redirects and helper
|
||||
//! fetches also pass through it.
|
||||
|
||||
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
|
||||
|
||||
use tokio::net::lookup_host;
|
||||
use url::{Host, Url};
|
||||
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Parse a caller-provided URL and require an HTTP(S) host.
|
||||
pub fn validate_http_url(raw: &str) -> Result<Url, FetchError> {
|
||||
let trimmed = raw.trim();
|
||||
if trimmed.is_empty() {
|
||||
return Err(FetchError::InvalidUrl("URL must not be empty".into()));
|
||||
}
|
||||
|
||||
let parsed =
|
||||
Url::parse(trimmed).map_err(|e| FetchError::InvalidUrl(format!("invalid URL: {e}")))?;
|
||||
match parsed.scheme() {
|
||||
"http" | "https" => {}
|
||||
scheme => {
|
||||
return Err(FetchError::InvalidUrl(format!(
|
||||
"scheme '{scheme}' is not allowed, use http:// or https://"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
if parsed.host().is_none() {
|
||||
return Err(FetchError::InvalidUrl("URL must include a host".into()));
|
||||
}
|
||||
|
||||
Ok(parsed)
|
||||
}
|
||||
|
||||
/// Parse, resolve, and reject private/internal destinations.
|
||||
///
|
||||
/// A domain is rejected if any resolved address is private or reserved.
|
||||
/// That is intentionally conservative: mixed public/private DNS answers
|
||||
/// are unsafe for server-side fetching.
|
||||
pub async fn validate_public_http_url(raw: &str) -> Result<Url, FetchError> {
|
||||
let parsed = validate_http_url(raw)?;
|
||||
validate_url_host_is_public(&parsed).await?;
|
||||
Ok(parsed)
|
||||
}
|
||||
|
||||
async fn validate_url_host_is_public(url: &Url) -> Result<(), FetchError> {
|
||||
match url.host() {
|
||||
Some(Host::Ipv4(ip)) => reject_blocked_ip(IpAddr::V4(ip)),
|
||||
Some(Host::Ipv6(ip)) => reject_blocked_ip(IpAddr::V6(ip)),
|
||||
Some(Host::Domain(host)) => {
|
||||
let port = url
|
||||
.port_or_known_default()
|
||||
.ok_or_else(|| FetchError::InvalidUrl("URL must include a known port".into()))?;
|
||||
let addrs = lookup_host((host, port))
|
||||
.await
|
||||
.map_err(|e| FetchError::InvalidUrl(format!("failed to resolve host: {e}")))?;
|
||||
|
||||
let mut resolved = false;
|
||||
for addr in addrs {
|
||||
resolved = true;
|
||||
reject_blocked_ip(addr.ip())?;
|
||||
}
|
||||
if !resolved {
|
||||
return Err(FetchError::InvalidUrl(
|
||||
"host did not resolve to any addresses".into(),
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
None => Err(FetchError::InvalidUrl("URL must include a host".into())),
|
||||
}
|
||||
}
|
||||
|
||||
fn reject_blocked_ip(ip: IpAddr) -> Result<(), FetchError> {
|
||||
if is_blocked_ip(ip) {
|
||||
Err(FetchError::InvalidUrl(
|
||||
"URL resolves to a blocked private or internal address".into(),
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Return true for IP ranges that should never be fetched server-side.
|
||||
pub fn is_blocked_ip(ip: IpAddr) -> bool {
|
||||
match ip {
|
||||
IpAddr::V4(ip) => is_blocked_ipv4(ip),
|
||||
IpAddr::V6(ip) => is_blocked_ipv6(ip),
|
||||
}
|
||||
}
|
||||
|
||||
/// IPv4 rule set: `true` when `ip` is in any blocked range.
///
/// Blocked ranges:
/// * `0.0.0.0/8` (includes the unspecified address)
/// * `224.0.0.0/3` (first octet 224 and above)
/// * `100.64.0.0/10`
/// * `192.0.0.0/24`
/// * `198.18.0.0/15`
/// * loopback, private, link-local and documentation ranges as defined by
///   the corresponding `Ipv4Addr` predicates
fn is_blocked_ipv4(ip: Ipv4Addr) -> bool {
    match ip.octets() {
        [0, ..] => true,
        [224..=255, ..] => true,
        [100, 64..=127, ..] => true,
        [192, 0, 0, _] => true,
        [198, 18..=19, ..] => true,
        // `is_documentation` covers 192.0.2.0/24, 198.51.100.0/24 and 203.0.113.0/24.
        _ => {
            ip.is_loopback() || ip.is_private() || ip.is_link_local() || ip.is_documentation()
        }
    }
}
|
||||
|
||||
fn is_blocked_ipv6(ip: Ipv6Addr) -> bool {
|
||||
let s = ip.segments();
|
||||
|
||||
ip.is_unspecified()
|
||||
|| ip.is_loopback()
|
||||
|| ip.is_multicast()
|
||||
|| (s[0] & 0xfe00) == 0xfc00
|
||||
|| (s[0] & 0xffc0) == 0xfe80
|
||||
|| (s[0] == 0x0064 && s[1] == 0xff9b && s[2] == 0 && s[3] == 0 && s[4] == 0 && s[5] == 0)
|
||||
|| (s[0] == 0x2001 && s[1] == 0x0db8)
|
||||
|| embedded_ipv4(ip).is_some_and(is_blocked_ipv4)
|
||||
}
|
||||
|
||||
/// Extract the IPv4 address carried in the low 32 bits of `ip`, if any.
///
/// Returns `Some` for IPv4-mapped (`::ffff:a.b.c.d`) and IPv4-compatible
/// (`::a.b.c.d`) addresses and `None` for everything else. Note that the
/// IPv4-compatible form also matches `::` and `::1`, which yield `0.0.0.0`
/// and `0.0.0.1` respectively.
///
/// This is exactly the contract of the standard library's
/// [`Ipv6Addr::to_ipv4`], so it is used instead of decoding the segments by
/// hand.
fn embedded_ipv4(ip: Ipv6Addr) -> Option<Ipv4Addr> {
    ip.to_ipv4()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};

    use super::{is_blocked_ip, validate_public_http_url};

    /// IPv4 literals from blocked ranges must be refused by the full
    /// URL validation path.
    #[tokio::test]
    async fn blocks_ipv4_internal_ranges() {
        for ip in [
            Ipv4Addr::new(0, 0, 0, 0),
            Ipv4Addr::new(10, 0, 0, 1),
            Ipv4Addr::new(100, 64, 0, 1),
            Ipv4Addr::new(127, 0, 0, 1),
            Ipv4Addr::new(169, 254, 169, 254),
            Ipv4Addr::new(172, 16, 0, 1),
            Ipv4Addr::new(192, 168, 0, 1),
            Ipv4Addr::new(198, 18, 0, 1),
        ] {
            let url = format!("http://{ip}/");
            assert!(validate_public_http_url(&url).await.is_err(), "{ip}");
        }
    }

    /// Blocked IPv6 ranges. `is_blocked_ip` is synchronous, so this is a
    /// plain test and needs no async runtime.
    #[test]
    fn blocks_ipv6_internal_ranges() {
        for ip in [
            Ipv6Addr::LOCALHOST,
            Ipv6Addr::UNSPECIFIED,
            "fc00::1".parse().unwrap(),
            "fe80::1".parse().unwrap(),
            "64:ff9b::7f00:1".parse().unwrap(),
            "::ffff:127.0.0.1".parse().unwrap(),
        ] {
            assert!(is_blocked_ip(IpAddr::V6(ip)), "{ip}");
        }
    }

    /// Public IP literals pass both the URL validation and the raw IP check.
    #[tokio::test]
    async fn allows_public_ip_literals() {
        assert!(
            validate_public_http_url("https://93.184.216.34/")
                .await
                .is_ok()
        );
        assert!(!is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))));
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue