feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-04-14 19:33:06 +02:00
parent a4c351d5ae
commit 050b2ef463
7 changed files with 109 additions and 17 deletions

1
.gitignore vendored
View file

@ -3,3 +3,4 @@ target/
.env .env
proxies.txt proxies.txt
.claude/skills/ .claude/skills/
*.json

View file

@ -3,6 +3,13 @@
All notable changes to webclaw are documented here. All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/). Format follows [Keep a Changelog](https://keepachangelog.com/).
## [0.3.12] — 2026-04-10
### Added
- **Crawl scope control**: new `allow_subdomains` and `allow_external_links` fields on `CrawlConfig`. By default crawls stay same-origin. Enable `allow_subdomains` to follow sibling/child subdomains (e.g. blog.example.com from example.com), or `allow_external_links` for full cross-origin crawling. Root domain extraction uses a heuristic that handles two-part TLDs (co.uk, com.au).
---
## [0.3.11] — 2026-04-10 ## [0.3.11] — 2026-04-10
### Added ### Added

12
Cargo.lock generated
View file

@ -3102,7 +3102,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-cli" name = "webclaw-cli"
version = "0.3.11" version = "0.3.12"
dependencies = [ dependencies = [
"clap", "clap",
"dotenvy", "dotenvy",
@ -3122,7 +3122,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-core" name = "webclaw-core"
version = "0.3.11" version = "0.3.12"
dependencies = [ dependencies = [
"ego-tree", "ego-tree",
"once_cell", "once_cell",
@ -3140,7 +3140,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-fetch" name = "webclaw-fetch"
version = "0.3.11" version = "0.3.12"
dependencies = [ dependencies = [
"bytes", "bytes",
"calamine", "calamine",
@ -3162,7 +3162,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-llm" name = "webclaw-llm"
version = "0.3.11" version = "0.3.12"
dependencies = [ dependencies = [
"async-trait", "async-trait",
"reqwest", "reqwest",
@ -3175,7 +3175,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-mcp" name = "webclaw-mcp"
version = "0.3.11" version = "0.3.12"
dependencies = [ dependencies = [
"dirs", "dirs",
"dotenvy", "dotenvy",
@ -3196,7 +3196,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-pdf" name = "webclaw-pdf"
version = "0.3.11" version = "0.3.12"
dependencies = [ dependencies = [
"pdf-extract", "pdf-extract",
"thiserror", "thiserror",

View file

@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"] members = ["crates/*"]
[workspace.package] [workspace.package]
version = "0.3.11" version = "0.3.12"
edition = "2024" edition = "2024"
license = "AGPL-3.0" license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw" repository = "https://github.com/0xMassi/webclaw"

View file

@ -1218,6 +1218,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
exclude_patterns, exclude_patterns,
progress_tx: Some(progress_tx), progress_tx: Some(progress_tx),
cancel_flag: Some(Arc::clone(&cancel_flag)), cancel_flag: Some(Arc::clone(&cancel_flag)),
allow_subdomains: false,
allow_external_links: false,
}; };
// Load resume state if --crawl-state file exists // Load resume state if --crawl-state file exists

View file

@ -1,9 +1,13 @@
/// Recursive same-origin web crawler built on top of [`FetchClient`]. /// Recursive web crawler built on top of [`FetchClient`].
/// ///
/// Starts from a seed URL, extracts content, discovers links, and follows /// Starts from a seed URL, extracts content, discovers links, and follows
/// them breadth-first up to a configurable depth/page limit. Uses a semaphore /// them breadth-first up to a configurable depth/page limit. Uses a semaphore
/// for bounded concurrency and per-request delays for politeness. /// for bounded concurrency and per-request delays for politeness.
/// ///
/// Scope control: by default only same-origin links are followed. Enable
/// `allow_subdomains` to include sibling/child subdomains of the seed host,
/// or `allow_external_links` to follow links to any domain.
///
/// When `use_sitemap` is enabled, the crawler first discovers URLs from the /// When `use_sitemap` is enabled, the crawler first discovers URLs from the
/// site's sitemaps and seeds the BFS frontier before crawling. /// site's sitemaps and seeds the BFS frontier before crawling.
use std::collections::HashSet; use std::collections::HashSet;
@ -39,11 +43,17 @@ pub struct CrawlConfig {
/// Seed BFS frontier from sitemap discovery before crawling. /// Seed BFS frontier from sitemap discovery before crawling.
pub use_sitemap: bool, pub use_sitemap: bool,
/// Glob patterns for paths to include. If non-empty, only matching URLs are crawled. /// Glob patterns for paths to include. If non-empty, only matching URLs are crawled.
/// E.g. `["/api/*", "/guides/*"]` matched against the URL path. /// E.g. `["/api/*", "/guides/*"]` -- matched against the URL path.
pub include_patterns: Vec<String>, pub include_patterns: Vec<String>,
/// Glob patterns for paths to exclude. Checked after include_patterns. /// Glob patterns for paths to exclude. Checked after include_patterns.
/// E.g. `["/changelog/*", "/blog/*"]` matching URLs are skipped. /// E.g. `["/changelog/*", "/blog/*"]` -- matching URLs are skipped.
pub exclude_patterns: Vec<String>, pub exclude_patterns: Vec<String>,
/// Follow links on subdomains of the seed domain (e.g. blog.example.com
/// when crawling example.com). Default: false (same-origin only).
pub allow_subdomains: bool,
/// Follow links to entirely different domains. Default: false.
/// When true, the crawler becomes cross-origin. Use with caution.
pub allow_external_links: bool,
/// Optional channel sender for streaming per-page results as they complete. /// Optional channel sender for streaming per-page results as they complete.
/// When set, each `PageResult` is sent on this channel immediately after extraction. /// When set, each `PageResult` is sent on this channel immediately after extraction.
pub progress_tx: Option<tokio::sync::broadcast::Sender<PageResult>>, pub progress_tx: Option<tokio::sync::broadcast::Sender<PageResult>>,
@ -64,6 +74,8 @@ impl Default for CrawlConfig {
use_sitemap: false, use_sitemap: false,
include_patterns: Vec::new(), include_patterns: Vec::new(),
exclude_patterns: Vec::new(), exclude_patterns: Vec::new(),
allow_subdomains: false,
allow_external_links: false,
progress_tx: None, progress_tx: None,
cancel_flag: None, cancel_flag: None,
} }
@ -113,6 +125,8 @@ pub struct Crawler {
client: Arc<FetchClient>, client: Arc<FetchClient>,
config: CrawlConfig, config: CrawlConfig,
seed_origin: String, seed_origin: String,
/// Root domain of the seed URL for subdomain matching (e.g. "example.com").
seed_root_domain: String,
} }
impl Crawler { impl Crawler {
@ -121,6 +135,7 @@ impl Crawler {
pub fn new(seed_url: &str, config: CrawlConfig) -> Result<Self, FetchError> { pub fn new(seed_url: &str, config: CrawlConfig) -> Result<Self, FetchError> {
let seed = Url::parse(seed_url).map_err(|_| FetchError::InvalidUrl(seed_url.into()))?; let seed = Url::parse(seed_url).map_err(|_| FetchError::InvalidUrl(seed_url.into()))?;
let seed_origin = origin_key(&seed); let seed_origin = origin_key(&seed);
let seed_root_domain = root_domain(&seed);
let client = FetchClient::new(config.fetch.clone())?; let client = FetchClient::new(config.fetch.clone())?;
@ -128,6 +143,7 @@ impl Crawler {
client: Arc::new(client), client: Arc::new(client),
config, config,
seed_origin, seed_origin,
seed_root_domain,
}) })
} }
@ -278,7 +294,7 @@ impl Crawler {
let delay = self.config.delay; let delay = self.config.delay;
handles.push(tokio::spawn(async move { handles.push(tokio::spawn(async move {
// Acquire permit blocks if concurrency limit reached // Acquire permit -- blocks if concurrency limit reached
let _permit = permit.acquire().await.expect("semaphore closed"); let _permit = permit.acquire().await.expect("semaphore closed");
tokio::time::sleep(delay).await; tokio::time::sleep(delay).await;
@ -392,10 +408,21 @@ impl Crawler {
_ => return None, _ => return None,
} }
// Same-origin check (scheme + host + port) // Scope check: same-origin, subdomain, or external
if origin_key(&parsed) != self.seed_origin { if !self.config.allow_external_links {
let link_origin = origin_key(&parsed);
if link_origin != self.seed_origin {
// Not same-origin. Check if subdomain crawling is allowed.
if self.config.allow_subdomains {
let link_root = root_domain(&parsed);
if link_root != self.seed_root_domain {
return None; return None;
} }
} else {
return None;
}
}
}
// Path prefix filter // Path prefix filter
if let Some(ref prefix) = self.config.path_prefix if let Some(ref prefix) = self.config.path_prefix
@ -457,6 +484,29 @@ fn origin_key(url: &Url) -> String {
format!("{}://{}{}", url.scheme(), host, port_suffix) format!("{}://{}{}", url.scheme(), host, port_suffix)
} }
/// Extract the root domain from a URL for subdomain comparison.
/// "blog.docs.example.com" -> "example.com", "example.co.uk" -> "example.co.uk" (best-effort).
///
/// Uses a simple heuristic: take the last two labels, or three when the host
/// appears to use a two-part country TLD -- i.e. the final label is a 2-char
/// country code and the second-to-last label is short (<=3 chars, like
/// "co.uk", "com.au", "org.br"). Requiring a 2-char final label keeps short
/// second-level domains under generic TLDs intact: "news.bbc.com" must
/// collapse to "bbc.com", not be treated as its own root "news.bbc.com".
/// For exact public-suffix handling a PSL-based crate would be needed;
/// this stays dependency-free on purpose.
fn root_domain(url: &Url) -> String {
    // Hosts can be absent (e.g. non-network URLs); fall back to "".
    let host = url.host_str().unwrap_or("");
    // "www." is a presentation prefix, never part of the registrable domain.
    let host = host.strip_prefix("www.").unwrap_or(host);
    let labels: Vec<&str> = host.split('.').collect();
    if labels.len() <= 2 {
        // Already at (or below) registrable-domain granularity.
        return host.to_ascii_lowercase();
    }
    // Two-part TLD heuristic: 2-letter ccTLD preceded by a short registry SLD.
    let tld = labels[labels.len() - 1];
    let sld = labels[labels.len() - 2];
    if tld.len() == 2 && sld.len() <= 3 {
        labels[labels.len() - 3..].join(".").to_ascii_lowercase()
    } else {
        labels[labels.len() - 2..].join(".").to_ascii_lowercase()
    }
}
/// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"), /// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"),
/// lowercase scheme + host. Preserves query params and path case. /// lowercase scheme + host. Preserves query params and path case.
fn normalize(url: &Url) -> String { fn normalize(url: &Url) -> String {
@ -502,7 +552,7 @@ fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
while ti < text.len() { while ti < text.len() {
if pi < pat.len() && pat[pi] == b'*' && pi + 1 < pat.len() && pat[pi + 1] == b'*' { if pi < pat.len() && pat[pi] == b'*' && pi + 1 < pat.len() && pat[pi + 1] == b'*' {
// `**` match everything including slashes // `**` -- match everything including slashes
// Skip all consecutive `*` // Skip all consecutive `*`
while pi < pat.len() && pat[pi] == b'*' { while pi < pat.len() && pat[pi] == b'*' {
pi += 1; pi += 1;
@ -522,7 +572,7 @@ fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
} }
return false; return false;
} else if pi < pat.len() && pat[pi] == b'*' { } else if pi < pat.len() && pat[pi] == b'*' {
// `*` match any chars except `/` // `*` -- match any chars except `/`
star_pi = pi; star_pi = pi;
star_ti = ti; star_ti = ti;
pi += 1; pi += 1;
@ -603,6 +653,38 @@ mod tests {
assert_ne!(origin_key(&http), origin_key(&https)); assert_ne!(origin_key(&http), origin_key(&https));
} }
// -- root_domain tests --
#[test]
fn root_domain_simple() {
let url = Url::parse("https://example.com/page").unwrap();
assert_eq!(root_domain(&url), "example.com");
}
#[test]
fn root_domain_subdomain() {
let url = Url::parse("https://blog.example.com/page").unwrap();
assert_eq!(root_domain(&url), "example.com");
}
#[test]
fn root_domain_deep_subdomain() {
let url = Url::parse("https://a.b.c.example.com/").unwrap();
assert_eq!(root_domain(&url), "example.com");
}
#[test]
fn root_domain_country_tld() {
let url = Url::parse("https://blog.example.co.uk/").unwrap();
assert_eq!(root_domain(&url), "example.co.uk");
}
#[test]
fn root_domain_strips_www() {
let url = Url::parse("https://www.example.com/").unwrap();
assert_eq!(root_domain(&url), "example.com");
}
// -- glob_match tests -- // -- glob_match tests --
#[test] #[test]

View file

@ -1,6 +1,6 @@
{ {
"name": "create-webclaw", "name": "create-webclaw",
"version": "0.1.3", "version": "0.1.4",
"mcpName": "io.github.0xMassi/webclaw", "mcpName": "io.github.0xMassi/webclaw",
"description": "Set up webclaw MCP server for AI agents (Claude, Cursor, Windsurf, OpenCode, Codex, Antigravity)", "description": "Set up webclaw MCP server for AI agents (Claude, Cursor, Windsurf, OpenCode, Codex, Antigravity)",
"bin": { "bin": {