From 7de5526ac11b2d7321c33b0825b5e41f1cc0835d Mon Sep 17 00:00:00 2001
From: Valerio <88933932+0xMassi@users.noreply.github.com>
Date: Tue, 12 May 2026 11:46:58 +0200
Subject: [PATCH] fix(extractors): harden amazon/ebay host validation

---
 .../src/extractors/amazon_product.rs          | 40 ++++++++++++++-----
 .../src/extractors/ebay_listing.rs            | 36 ++++++++++++-----
 2 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/crates/webclaw-fetch/src/extractors/amazon_product.rs b/crates/webclaw-fetch/src/extractors/amazon_product.rs
index fed6b9f..0529d93 100644
--- a/crates/webclaw-fetch/src/extractors/amazon_product.rs
+++ b/crates/webclaw-fetch/src/extractors/amazon_product.rs
@@ -30,6 +30,7 @@ use std::sync::OnceLock;
 
 use regex::Regex;
 use serde_json::{Value, json};
+use url::Url;
 
 use super::ExtractorInfo;
 use crate::cloud::{self, CloudError};
@@ -52,8 +53,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
 };
 
 pub fn matches(url: &str) -> bool {
-    let host = host_of(url);
-    if !is_amazon_host(host) {
+    let Some(host) = host_of(url) else {
+        return false;
+    };
+    if !is_amazon_host(&host) {
         return false;
     }
     parse_asin(url).is_some()
@@ -162,17 +165,32 @@ pub fn parse(html: &str, url: &str, asin: &str) -> Value {
 // URL helpers
 // ---------------------------------------------------------------------------
 
-fn host_of(url: &str) -> &str {
-    url.split("://")
-        .nth(1)
-        .unwrap_or(url)
-        .split('/')
-        .next()
-        .unwrap_or("")
+fn host_of(url: &str) -> Option<String> {
+    let parsed = Url::parse(url).ok()?;
+    if !parsed.username().is_empty() || parsed.password().is_some() {
+        return None;
+    }
+    parsed.host_str().map(str::to_string)
 }
 
 fn is_amazon_host(host: &str) -> bool {
-    host.starts_with("www.amazon.") || host.starts_with("amazon.")
+    matches!(
+        host,
+        "amazon.com"
+            | "www.amazon.com"
+            | "amazon.co.uk"
+            | "www.amazon.co.uk"
+            | "amazon.de"
+            | "www.amazon.de"
+            | "amazon.fr"
+            | "www.amazon.fr"
+            | "amazon.it"
+            | "www.amazon.it"
+            | "amazon.es"
+            | "www.amazon.es"
+            | "amazon.co.jp"
+            | "www.amazon.co.jp"
+    )
 }
 
 /// Pull a 10-char ASIN out of any recognised Amazon URL shape:
@@ -357,6 +375,8 @@ mod tests {
         assert!(!matches("https://www.amazon.com/"));
         assert!(!matches("https://www.amazon.com/gp/cart"));
         assert!(!matches("https://example.com/dp/B0CHX1W1XY"));
+        assert!(!matches("https://www.amazon.com@127.0.0.1/dp/B0CHX1W1XY"));
+        assert!(!matches("https://www.amazon.evil.com/dp/B0CHX1W1XY"));
     }
 
     #[test]
diff --git a/crates/webclaw-fetch/src/extractors/ebay_listing.rs b/crates/webclaw-fetch/src/extractors/ebay_listing.rs
index dbc85ab..5e1f779 100644
--- a/crates/webclaw-fetch/src/extractors/ebay_listing.rs
+++ b/crates/webclaw-fetch/src/extractors/ebay_listing.rs
@@ -12,6 +12,7 @@ use std::sync::OnceLock;
 
 use regex::Regex;
 use serde_json::{Value, json};
+use url::Url;
 
 use super::ExtractorInfo;
 use crate::cloud::{self, CloudError};
@@ -32,8 +33,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
 };
 
 pub fn matches(url: &str) -> bool {
-    let host = host_of(url);
-    if !is_ebay_host(host) {
+    let Some(host) = host_of(url) else {
+        return false;
+    };
+    if !is_ebay_host(&host) {
         return false;
     }
     parse_item_id(url).is_some()
@@ -120,17 +123,28 @@ pub fn parse(html: &str, url: &str, item_id: &str) -> Value {
 // URL helpers
 // ---------------------------------------------------------------------------
 
-fn host_of(url: &str) -> &str {
-    url.split("://")
-        .nth(1)
-        .unwrap_or(url)
-        .split('/')
-        .next()
-        .unwrap_or("")
+fn host_of(url: &str) -> Option<String> {
+    let parsed = Url::parse(url).ok()?;
+    if !parsed.username().is_empty() || parsed.password().is_some() {
+        return None;
+    }
+    parsed.host_str().map(str::to_string)
 }
 
 fn is_ebay_host(host: &str) -> bool {
-    host.starts_with("www.ebay.") || host.starts_with("ebay.")
+    matches!(
+        host,
+        "ebay.com"
+            | "www.ebay.com"
+            | "ebay.co.uk"
+            | "www.ebay.co.uk"
+            | "ebay.de"
+            | "www.ebay.de"
+            | "ebay.fr"
+            | "www.ebay.fr"
+            | "ebay.it"
+            | "www.ebay.it"
+    )
 }
 
 /// Pull the numeric item id out of `/itm/{id}` or `/itm/{slug}/{id}`
@@ -276,6 +290,8 @@ mod tests {
         assert!(!matches("https://www.ebay.com/"));
         assert!(!matches("https://www.ebay.com/sch/foo"));
         assert!(!matches("https://example.com/itm/325478156234"));
+        assert!(!matches("https://www.ebay.com@127.0.0.1/itm/325478156234"));
+        assert!(!matches("https://www.ebay.attacker.com/itm/325478156234"));
     }
 
     #[test]