diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b2ea54a..87b13a0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -5,14 +5,15 @@ on: tags: ["v*"] permissions: - contents: write - packages: write + contents: read env: CARGO_TERM_COLOR: always jobs: build: + permissions: + contents: read name: Build ${{ matrix.target }} runs-on: ${{ matrix.os }} strategy: @@ -106,9 +107,9 @@ jobs: name: Release needs: build runs-on: ubuntu-latest + permissions: + contents: write steps: - - uses: actions/checkout@v4 - - uses: actions/download-artifact@v4 with: path: artifacts @@ -122,18 +123,23 @@ jobs: cat SHA256SUMS - name: Create GitHub Release - uses: softprops/action-gh-release@v2 - with: - generate_release_notes: true - files: | - artifacts/*.tar.gz - artifacts/*.zip - artifacts/SHA256SUMS + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + tag="${GITHUB_REF#refs/tags/}" + gh release create "$tag" \ + artifacts/*.tar.gz \ + artifacts/*.zip \ + artifacts/SHA256SUMS \ + --generate-notes docker: name: Docker needs: release runs-on: ubuntu-latest + permissions: + contents: read + packages: write steps: - uses: actions/checkout@v4 @@ -193,6 +199,8 @@ jobs: name: Update Homebrew needs: [release, docker] runs-on: ubuntu-latest + permissions: + contents: read steps: - name: Compute all checksums and update formula env: diff --git a/CHANGELOG.md b/CHANGELOG.md index 025b1db..eb1a2ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,17 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.6.1] — 2026-05-12 + +### Fixed +- Hardened URL safety across the CLI, MCP server, and self-hosted API paths so local and private network targets are rejected more consistently, including after DNS resolution and redirects. +- Added a timeout around inline JavaScript data extraction so hostile pages cannot keep the extractor busy forever. 
+- Tightened Amazon and eBay URL recognition so deceptive hosts are rejected while common international marketplaces still work. +- Avoided unnecessary decoding work on large responses during bot-challenge detection. +- Reduced release workflow token permissions so build jobs run with narrower GitHub access. + +--- + ## [0.6.0] — 2026-05-10 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index ab23a3f..5b96a0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3219,7 +3219,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.6.0" +version = "0.6.1" dependencies = [ "clap", "dotenvy", @@ -3240,7 +3240,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.6.0" +version = "0.6.1" dependencies = [ "ego-tree", "once_cell", @@ -3258,7 +3258,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.6.0" +version = "0.6.1" dependencies = [ "async-trait", "bytes", @@ -3284,7 +3284,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.6.0" +version = "0.6.1" dependencies = [ "async-trait", "reqwest", @@ -3297,7 +3297,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.6.0" +version = "0.6.1" dependencies = [ "dirs", "dotenvy", @@ -3317,7 +3317,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.6.0" +version = "0.6.1" dependencies = [ "pdf-extract", "thiserror", @@ -3326,7 +3326,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.6.0" +version = "0.6.1" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index 6e87225..72da7c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.6.0" +version = "0.6.1" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index a45bce8..03c1490 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ 
-849,11 +849,18 @@ async fn enrich_html_with_stylesheets(html: &str, base_url: &str) -> String { let client = reqwest::Client::builder() .timeout(std::time::Duration::from_secs(5)) + .redirect(reqwest::redirect::Policy::none()) .build() .unwrap_or_default(); let mut extra_css = String::new(); for href in &hrefs { + if webclaw_fetch::url_security::validate_public_http_url(href) + .await + .is_err() + { + continue; + } if let Ok(resp) = client.get(href).send().await && resp.status().is_success() && let Ok(body) = resp.text().await diff --git a/crates/webclaw-core/src/js_eval.rs b/crates/webclaw-core/src/js_eval.rs index 213b1e5..e1fb2de 100644 --- a/crates/webclaw-core/src/js_eval.rs +++ b/crates/webclaw-core/src/js_eval.rs @@ -9,10 +9,12 @@ use once_cell::sync::Lazy; use regex::Regex; use rquickjs::{Context, Runtime}; use scraper::{Html, Selector}; +use std::time::{Duration, Instant}; use tracing::debug; static SCRIPT_SELECTOR: Lazy = Lazy::new(|| Selector::parse("script").unwrap()); static HTML_TAG_RE: Lazy = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap()); +const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250); /// A blob of data extracted from JS execution. 
pub struct JsDataBlob { @@ -49,6 +51,8 @@ pub fn extract_js_data(html: &str) -> Vec { let rt = Runtime::new().expect("QuickJS runtime creation failed"); rt.set_memory_limit(64 * 1024 * 1024); // 64 MB rt.set_max_stack_size(1024 * 1024); // 1 MB + let deadline = Instant::now() + JS_EVAL_TIMEOUT; + rt.set_interrupt_handler(Some(Box::new(move || Instant::now() >= deadline))); let ctx = Context::full(&rt).expect("QuickJS context creation failed"); @@ -464,6 +468,8 @@ fn walk_rsc_tree(value: &serde_json::Value, out: &mut Vec, depth: usize) #[cfg(test)] mod tests { + use std::time::{Duration, Instant}; + use super::*; #[test] @@ -493,6 +499,29 @@ mod tests { ); } + #[test] + fn js_eval_interrupts_infinite_loops() { + let html = r#" + + + + + hello + + "#; + + let start = Instant::now(); + let blobs = extract_js_data(html); + + assert!(blobs.is_empty()); + assert!( + start.elapsed() < Duration::from_secs(2), + "QuickJS execution should be interrupted quickly" + ); + } + #[test] fn skips_external_and_module_scripts() { let html = r#" diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index 4fff454..810c450 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -783,6 +783,10 @@ fn is_pdf_content_type(headers: &http::HeaderMap) -> bool { /// Detect if a response looks like a bot protection challenge page. 
fn is_challenge_response(response: &Response) -> bool { + let body_len = response.body().len(); + if body_len > 15_000 || body_len == 0 { + return false; + } is_challenge_html(response.text().as_ref()) } diff --git a/crates/webclaw-fetch/src/extractors/amazon_product.rs b/crates/webclaw-fetch/src/extractors/amazon_product.rs index fed6b9f..e374b75 100644 --- a/crates/webclaw-fetch/src/extractors/amazon_product.rs +++ b/crates/webclaw-fetch/src/extractors/amazon_product.rs @@ -30,6 +30,7 @@ use std::sync::OnceLock; use regex::Regex; use serde_json::{Value, json}; +use url::Url; use super::ExtractorInfo; use crate::cloud::{self, CloudError}; @@ -52,8 +53,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo { }; pub fn matches(url: &str) -> bool { - let host = host_of(url); - if !is_amazon_host(host) { + let Some(host) = host_of(url) else { + return false; + }; + if !is_amazon_host(&host) { return false; } parse_asin(url).is_some() @@ -162,17 +165,41 @@ pub fn parse(html: &str, url: &str, asin: &str) -> Value { // URL helpers // --------------------------------------------------------------------------- -fn host_of(url: &str) -> &str { - url.split("://") - .nth(1) - .unwrap_or(url) - .split('/') - .next() - .unwrap_or("") +fn host_of(url: &str) -> Option { + let parsed = Url::parse(url).ok()?; + if !parsed.username().is_empty() || parsed.password().is_some() { + return None; + } + parsed.host_str().map(|host| host.to_ascii_lowercase()) } fn is_amazon_host(host: &str) -> bool { - host.starts_with("www.amazon.") || host.starts_with("amazon.") + const AMAZON_HOSTS: &[&str] = &[ + "amazon.ae", + "amazon.ca", + "amazon.cn", + "amazon.co.jp", + "amazon.co.uk", + "amazon.com", + "amazon.com.au", + "amazon.com.be", + "amazon.com.br", + "amazon.com.mx", + "amazon.com.tr", + "amazon.de", + "amazon.eg", + "amazon.es", + "amazon.fr", + "amazon.in", + "amazon.it", + "amazon.nl", + "amazon.pl", + "amazon.sa", + "amazon.se", + "amazon.sg", + ]; + let normalized = 
host.strip_prefix("www.").unwrap_or(host); + AMAZON_HOSTS.contains(&normalized) } /// Pull a 10-char ASIN out of any recognised Amazon URL shape: @@ -347,6 +374,9 @@ mod tests { assert!(matches("https://www.amazon.com/dp/B0CHX1W1XY")); assert!(matches("https://www.amazon.co.uk/dp/B0CHX1W1XY/")); assert!(matches("https://www.amazon.de/dp/B0CHX1W1XY?psc=1")); + assert!(matches("https://www.amazon.ca/dp/B0CHX1W1XY")); + assert!(matches("https://www.amazon.com.au/dp/B0CHX1W1XY")); + assert!(matches("https://www.amazon.in/dp/B0CHX1W1XY")); assert!(matches( "https://www.amazon.com/gp/product/B0CHX1W1XY/ref=foo" )); @@ -357,6 +387,8 @@ mod tests { assert!(!matches("https://www.amazon.com/")); assert!(!matches("https://www.amazon.com/gp/cart")); assert!(!matches("https://example.com/dp/B0CHX1W1XY")); + assert!(!matches("https://www.amazon.com@127.0.0.1/dp/B0CHX1W1XY")); + assert!(!matches("https://www.amazon.evil.com/dp/B0CHX1W1XY")); } #[test] diff --git a/crates/webclaw-fetch/src/extractors/ebay_listing.rs b/crates/webclaw-fetch/src/extractors/ebay_listing.rs index dbc85ab..36f18e9 100644 --- a/crates/webclaw-fetch/src/extractors/ebay_listing.rs +++ b/crates/webclaw-fetch/src/extractors/ebay_listing.rs @@ -12,6 +12,7 @@ use std::sync::OnceLock; use regex::Regex; use serde_json::{Value, json}; +use url::Url; use super::ExtractorInfo; use crate::cloud::{self, CloudError}; @@ -32,8 +33,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo { }; pub fn matches(url: &str) -> bool { - let host = host_of(url); - if !is_ebay_host(host) { + let Some(host) = host_of(url) else { + return false; + }; + if !is_ebay_host(&host) { return false; } parse_item_id(url).is_some() @@ -120,17 +123,37 @@ pub fn parse(html: &str, url: &str, item_id: &str) -> Value { // URL helpers // --------------------------------------------------------------------------- -fn host_of(url: &str) -> &str { - url.split("://") - .nth(1) - .unwrap_or(url) - .split('/') - .next() - .unwrap_or("") +fn host_of(url: 
&str) -> Option { + let parsed = Url::parse(url).ok()?; + if !parsed.username().is_empty() || parsed.password().is_some() { + return None; + } + parsed.host_str().map(|host| host.to_ascii_lowercase()) } fn is_ebay_host(host: &str) -> bool { - host.starts_with("www.ebay.") || host.starts_with("ebay.") + const EBAY_HOSTS: &[&str] = &[ + "ebay.at", + "ebay.be", + "ebay.ca", + "ebay.ch", + "ebay.co.uk", + "ebay.com", + "ebay.com.au", + "ebay.com.hk", + "ebay.com.my", + "ebay.com.sg", + "ebay.de", + "ebay.es", + "ebay.fr", + "ebay.ie", + "ebay.it", + "ebay.nl", + "ebay.ph", + "ebay.pl", + ]; + let normalized = host.strip_prefix("www.").unwrap_or(host); + EBAY_HOSTS.contains(&normalized) } /// Pull the numeric item id out of `/itm/{id}` or `/itm/{slug}/{id}` @@ -273,9 +296,14 @@ mod tests { "https://www.ebay.com/itm/vintage-typewriter/325478156234" )); assert!(matches("https://www.ebay.co.uk/itm/325478156234")); + assert!(matches("https://www.ebay.ca/itm/325478156234")); + assert!(matches("https://www.ebay.com.au/itm/325478156234")); + assert!(matches("https://www.ebay.es/itm/325478156234")); assert!(!matches("https://www.ebay.com/")); assert!(!matches("https://www.ebay.com/sch/foo")); assert!(!matches("https://example.com/itm/325478156234")); + assert!(!matches("https://www.ebay.com@127.0.0.1/itm/325478156234")); + assert!(!matches("https://www.ebay.attacker.com/itm/325478156234")); } #[test] diff --git a/crates/webclaw-fetch/src/tls.rs b/crates/webclaw-fetch/src/tls.rs index fdaeb0b..0479e77 100644 --- a/crates/webclaw-fetch/src/tls.rs +++ b/crates/webclaw-fetch/src/tls.rs @@ -5,9 +5,7 @@ //! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order, //! stream dependency, priorities) to match real browser fingerprints. 
-use std::time::Duration; - -use std::borrow::Cow; +use std::{borrow::Cow, io, time::Duration}; use wreq::http2::{ Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId, @@ -21,6 +19,41 @@ use wreq::{Client, Emulation}; use crate::browser::BrowserVariant; use crate::error::FetchError; +#[derive(Clone, Default)] +struct PublicDnsResolver; + +impl wreq::dns::Resolve for PublicDnsResolver { + fn resolve(&self, name: wreq::dns::Name) -> wreq::dns::Resolving { + Box::pin(async move { + let addrs = tokio::net::lookup_host((name.as_str(), 0)) + .await + .map_err(|e| Box::new(e) as Box)?; + let mut public = Vec::new(); + + for addr in addrs { + if crate::url_security::is_blocked_ip(addr.ip()) { + let err: Box = Box::new(io::Error::new( + io::ErrorKind::PermissionDenied, + "DNS resolved to a blocked private or internal address", + )); + return Err(err); + } + public.push(addr); + } + + if public.is_empty() { + let err: Box = Box::new(io::Error::new( + io::ErrorKind::NotFound, + "host did not resolve to any addresses", + )); + return Err(err); + } + + Ok(Box::new(public.into_iter()) as wreq::dns::Addrs) + }) + } +} + /// Chrome cipher list (TLS 1.3 + TLS 1.2 in Chrome's exact order). 
const CHROME_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA"; @@ -503,6 +536,8 @@ pub fn build_client( let proxy = wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?; builder = builder.proxy(proxy); + } else { + builder = builder.dns_resolver(PublicDnsResolver); } builder diff --git a/crates/webclaw-fetch/src/url_security.rs b/crates/webclaw-fetch/src/url_security.rs index 1d2b534..328879e 100644 --- a/crates/webclaw-fetch/src/url_security.rs +++ b/crates/webclaw-fetch/src/url_security.rs @@ -163,7 +163,9 @@ mod tests { Ipv4Addr::new(169, 254, 169, 254), Ipv4Addr::new(172, 16, 0, 1), Ipv4Addr::new(192, 168, 0, 1), + Ipv4Addr::new(192, 0, 0, 8), Ipv4Addr::new(198, 18, 0, 1), + Ipv4Addr::new(255, 255, 255, 255), ] { let url = format!("http://{ip}/"); assert!(validate_public_http_url(&url).await.is_err(), "{ip}"); @@ -193,4 +195,9 @@ mod tests { ); assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false); } + + #[tokio::test] + async fn blocks_localhost_domains_after_resolution() { + assert!(validate_public_http_url("http://localhost/").await.is_err()); + } } diff --git a/crates/webclaw-mcp/src/server.rs b/crates/webclaw-mcp/src/server.rs index d56032d..3b88bab 100644 --- a/crates/webclaw-mcp/src/server.rs +++ b/crates/webclaw-mcp/src/server.rs @@ -51,9 +51,10 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile { } } -/// Validate that a URL is non-empty and has an http or https scheme. 
-fn validate_url(url: &str) -> Result<(), String> { - webclaw_fetch::url_security::validate_http_url(url) +/// Validate that a URL is public HTTP(S), matching the fetch-layer SSRF guard. +async fn validate_url(url: &str) -> Result<(), String> { + webclaw_fetch::url_security::validate_public_http_url(url) + .await .map(|_| ()) .map_err(|e| format!("Invalid URL: {e}")) } @@ -161,7 +162,7 @@ impl WebclawMcp { /// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected. #[tool] async fn scrape(&self, Parameters(params): Parameters) -> Result { - validate_url(&params.url)?; + validate_url(&params.url).await?; let format = params.format.as_deref().unwrap_or("markdown"); let browser = parse_browser(params.browser.as_deref()); let include = params.include_selectors.unwrap_or_default(); @@ -251,7 +252,7 @@ impl WebclawMcp { /// Crawl a website starting from a seed URL, following links breadth-first up to a configurable depth and page limit. #[tool] async fn crawl(&self, Parameters(params): Parameters) -> Result { - validate_url(&params.url)?; + validate_url(&params.url).await?; if let Some(max) = params.max_pages && max > 500 @@ -300,7 +301,7 @@ impl WebclawMcp { /// Discover URLs from a website's sitemaps (robots.txt + sitemap.xml). 
#[tool] async fn map(&self, Parameters(params): Parameters) -> Result { - validate_url(&params.url)?; + validate_url(&params.url).await?; let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, &params.url) .await .map_err(|e| format!("Sitemap discovery failed: {e}"))?; @@ -323,7 +324,7 @@ impl WebclawMcp { return Err("batch is limited to 100 URLs per request".into()); } for u in &params.urls { - validate_url(u)?; + validate_url(u).await?; } let format = params.format.as_deref().unwrap_or("markdown"); @@ -365,7 +366,7 @@ impl WebclawMcp { &self, Parameters(params): Parameters, ) -> Result { - validate_url(&params.url)?; + validate_url(&params.url).await?; if params.schema.is_none() && params.prompt.is_none() { return Err("Either 'schema' or 'prompt' is required for extraction.".into()); @@ -422,7 +423,7 @@ impl WebclawMcp { &self, Parameters(params): Parameters, ) -> Result { - validate_url(&params.url)?; + validate_url(&params.url).await?; // No local LLM — fall back to cloud API directly if self.llm_chain.is_none() { @@ -464,7 +465,7 @@ impl WebclawMcp { /// Automatically falls back to the webclaw cloud API when bot protection is detected. #[tool] async fn diff(&self, Parameters(params): Parameters) -> Result { - validate_url(&params.url)?; + validate_url(&params.url).await?; let previous: webclaw_core::ExtractionResult = serde_json::from_str(&params.previous_snapshot) .map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?; @@ -532,7 +533,7 @@ impl WebclawMcp { /// Automatically falls back to the webclaw cloud API when bot protection is detected. 
#[tool] async fn brand(&self, Parameters(params): Parameters) -> Result { - validate_url(&params.url)?; + validate_url(&params.url).await?; let fetch_result = tokio::time::timeout(LOCAL_FETCH_TIMEOUT, self.fetch_client.fetch(&params.url)) .await @@ -737,7 +738,7 @@ impl WebclawMcp { &self, Parameters(params): Parameters, ) -> Result { - validate_url(&params.url)?; + validate_url(&params.url).await?; // Use the cached Firefox client, not the default Chrome one. // Reddit's `.json` endpoint rejects the wreq-Chrome TLS // fingerprint with a 403 even from residential IPs (they