fix(extractors): harden marketplace host matching

This commit is contained in:
Valerio 2026-05-12 12:03:43 +02:00
parent dbf9ce08a6
commit 307b4f980d
2 changed files with 80 additions and 20 deletions

View file

@ -30,6 +30,7 @@ use std::sync::OnceLock;
use regex::Regex; use regex::Regex;
use serde_json::{Value, json}; use serde_json::{Value, json};
use url::Url;
use super::ExtractorInfo; use super::ExtractorInfo;
use crate::cloud::{self, CloudError}; use crate::cloud::{self, CloudError};
@ -52,8 +53,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
}; };
pub fn matches(url: &str) -> bool { pub fn matches(url: &str) -> bool {
let host = host_of(url); let Some(host) = host_of(url) else {
if !is_amazon_host(host) { return false;
};
if !is_amazon_host(&host) {
return false; return false;
} }
parse_asin(url).is_some() parse_asin(url).is_some()
@ -162,17 +165,41 @@ pub fn parse(html: &str, url: &str, asin: &str) -> Value {
// URL helpers // URL helpers
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
fn host_of(url: &str) -> &str { fn host_of(url: &str) -> Option<String> {
url.split("://") let parsed = Url::parse(url).ok()?;
.nth(1) if !parsed.username().is_empty() || parsed.password().is_some() {
.unwrap_or(url) return None;
.split('/') }
.next() parsed.host_str().map(|host| host.to_ascii_lowercase())
.unwrap_or("")
} }
/// Report whether `host` is one of the known Amazon storefront domains.
///
/// The check is an exact match against a fixed allowlist, optionally preceded
/// by a single `www.` label. Prefix matching is deliberately avoided so that
/// look-alike hosts such as `www.amazon.evil.com` are rejected.
///
/// The comparison is case-sensitive; callers are expected to pass an already
/// lowercased host. One trailing root dot (`www.amazon.com.`) is tolerated
/// because a fully-qualified name with the root dot denotes the same host.
fn is_amazon_host(host: &str) -> bool {
    const AMAZON_HOSTS: &[&str] = &[
        "amazon.ae",
        "amazon.ca",
        "amazon.cn",
        "amazon.co.jp",
        "amazon.co.uk",
        "amazon.com",
        "amazon.com.au",
        "amazon.com.be",
        "amazon.com.br",
        "amazon.com.mx",
        "amazon.com.tr",
        "amazon.de",
        "amazon.eg",
        "amazon.es",
        "amazon.fr",
        "amazon.in",
        "amazon.it",
        "amazon.nl",
        "amazon.pl",
        "amazon.sa",
        "amazon.se",
        "amazon.sg",
    ];
    // Drop at most one trailing root dot; "amazon.com.." stays unmatched.
    let host = host.strip_suffix('.').unwrap_or(host);
    // Only a single leading "www." label is accepted; other subdomains are not.
    let normalized = host.strip_prefix("www.").unwrap_or(host);
    AMAZON_HOSTS.contains(&normalized)
}
/// Pull a 10-char ASIN out of any recognised Amazon URL shape: /// Pull a 10-char ASIN out of any recognised Amazon URL shape:
@ -347,6 +374,9 @@ mod tests {
assert!(matches("https://www.amazon.com/dp/B0CHX1W1XY")); assert!(matches("https://www.amazon.com/dp/B0CHX1W1XY"));
assert!(matches("https://www.amazon.co.uk/dp/B0CHX1W1XY/")); assert!(matches("https://www.amazon.co.uk/dp/B0CHX1W1XY/"));
assert!(matches("https://www.amazon.de/dp/B0CHX1W1XY?psc=1")); assert!(matches("https://www.amazon.de/dp/B0CHX1W1XY?psc=1"));
assert!(matches("https://www.amazon.ca/dp/B0CHX1W1XY"));
assert!(matches("https://www.amazon.com.au/dp/B0CHX1W1XY"));
assert!(matches("https://www.amazon.in/dp/B0CHX1W1XY"));
assert!(matches( assert!(matches(
"https://www.amazon.com/gp/product/B0CHX1W1XY/ref=foo" "https://www.amazon.com/gp/product/B0CHX1W1XY/ref=foo"
)); ));
@ -357,6 +387,8 @@ mod tests {
assert!(!matches("https://www.amazon.com/")); assert!(!matches("https://www.amazon.com/"));
assert!(!matches("https://www.amazon.com/gp/cart")); assert!(!matches("https://www.amazon.com/gp/cart"));
assert!(!matches("https://example.com/dp/B0CHX1W1XY")); assert!(!matches("https://example.com/dp/B0CHX1W1XY"));
assert!(!matches("https://www.amazon.com@127.0.0.1/dp/B0CHX1W1XY"));
assert!(!matches("https://www.amazon.evil.com/dp/B0CHX1W1XY"));
} }
#[test] #[test]

View file

@ -12,6 +12,7 @@ use std::sync::OnceLock;
use regex::Regex; use regex::Regex;
use serde_json::{Value, json}; use serde_json::{Value, json};
use url::Url;
use super::ExtractorInfo; use super::ExtractorInfo;
use crate::cloud::{self, CloudError}; use crate::cloud::{self, CloudError};
@ -32,8 +33,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
}; };
pub fn matches(url: &str) -> bool { pub fn matches(url: &str) -> bool {
let host = host_of(url); let Some(host) = host_of(url) else {
if !is_ebay_host(host) { return false;
};
if !is_ebay_host(&host) {
return false; return false;
} }
parse_item_id(url).is_some() parse_item_id(url).is_some()
@ -120,17 +123,37 @@ pub fn parse(html: &str, url: &str, item_id: &str) -> Value {
// URL helpers // URL helpers
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
fn host_of(url: &str) -> &str { fn host_of(url: &str) -> Option<String> {
url.split("://") let parsed = Url::parse(url).ok()?;
.nth(1) if !parsed.username().is_empty() || parsed.password().is_some() {
.unwrap_or(url) return None;
.split('/') }
.next() parsed.host_str().map(|host| host.to_ascii_lowercase())
.unwrap_or("")
} }
/// Report whether `host` is one of the known eBay storefront domains.
///
/// The check is an exact match against a fixed allowlist, optionally preceded
/// by a single `www.` label. Prefix matching is deliberately avoided so that
/// look-alike hosts such as `www.ebay.attacker.com` are rejected.
///
/// The comparison is case-sensitive; callers are expected to pass an already
/// lowercased host. One trailing root dot (`www.ebay.com.`) is tolerated
/// because a fully-qualified name with the root dot denotes the same host.
fn is_ebay_host(host: &str) -> bool {
    const EBAY_HOSTS: &[&str] = &[
        "ebay.at",
        "ebay.be",
        "ebay.ca",
        "ebay.ch",
        "ebay.co.uk",
        "ebay.com",
        "ebay.com.au",
        "ebay.com.hk",
        "ebay.com.my",
        "ebay.com.sg",
        "ebay.de",
        "ebay.es",
        "ebay.fr",
        "ebay.ie",
        "ebay.it",
        "ebay.nl",
        "ebay.ph",
        "ebay.pl",
    ];
    // Drop at most one trailing root dot; "ebay.com.." stays unmatched.
    let host = host.strip_suffix('.').unwrap_or(host);
    // Only a single leading "www." label is accepted; other subdomains are not.
    let normalized = host.strip_prefix("www.").unwrap_or(host);
    EBAY_HOSTS.contains(&normalized)
}
/// Pull the numeric item id out of `/itm/{id}` or `/itm/{slug}/{id}` /// Pull the numeric item id out of `/itm/{id}` or `/itm/{slug}/{id}`
@ -273,9 +296,14 @@ mod tests {
"https://www.ebay.com/itm/vintage-typewriter/325478156234" "https://www.ebay.com/itm/vintage-typewriter/325478156234"
)); ));
assert!(matches("https://www.ebay.co.uk/itm/325478156234")); assert!(matches("https://www.ebay.co.uk/itm/325478156234"));
assert!(matches("https://www.ebay.ca/itm/325478156234"));
assert!(matches("https://www.ebay.com.au/itm/325478156234"));
assert!(matches("https://www.ebay.es/itm/325478156234"));
assert!(!matches("https://www.ebay.com/")); assert!(!matches("https://www.ebay.com/"));
assert!(!matches("https://www.ebay.com/sch/foo")); assert!(!matches("https://www.ebay.com/sch/foo"));
assert!(!matches("https://example.com/itm/325478156234")); assert!(!matches("https://example.com/itm/325478156234"));
assert!(!matches("https://www.ebay.com@127.0.0.1/itm/325478156234"));
assert!(!matches("https://www.ebay.attacker.com/itm/325478156234"));
} }
#[test] #[test]