mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat: add allow_subdomains and allow_external_links to CrawlConfig
Crawls are same-origin by default. Enable allow_subdomains to follow sibling/child subdomains (blog.example.com from example.com), or allow_external_links for full cross-origin crawling. Root domain extraction uses a heuristic that handles two-part TLDs (co.uk, com.au). Includes 5 unit tests for root_domain(). Bump to 0.3.12. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a4c351d5ae
commit
050b2ef463
7 changed files with 109 additions and 17 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -3,3 +3,4 @@ target/
|
||||||
.env
|
.env
|
||||||
proxies.txt
|
proxies.txt
|
||||||
.claude/skills/
|
.claude/skills/
|
||||||
|
*.json
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,13 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [0.3.12] — 2026-04-10
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **Crawl scope control**: new `allow_subdomains` and `allow_external_links` fields on `CrawlConfig`. By default crawls stay same-origin. Enable `allow_subdomains` to follow sibling/child subdomains (e.g. blog.example.com from example.com), or `allow_external_links` for full cross-origin crawling. Root domain extraction uses a heuristic that handles two-part TLDs (co.uk, com.au).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## [0.3.11] — 2026-04-10
|
## [0.3.11] — 2026-04-10
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
|
||||||
12
Cargo.lock
generated
12
Cargo.lock
generated
|
|
@ -3102,7 +3102,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.3.11"
|
version = "0.3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3122,7 +3122,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.3.11"
|
version = "0.3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -3140,7 +3140,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.3.11"
|
version = "0.3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"calamine",
|
"calamine",
|
||||||
|
|
@ -3162,7 +3162,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.3.11"
|
version = "0.3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3175,7 +3175,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.3.11"
|
version = "0.3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dirs",
|
"dirs",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3196,7 +3196,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.3.11"
|
version = "0.3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
||||||
members = ["crates/*"]
|
members = ["crates/*"]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "0.3.11"
|
version = "0.3.12"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
license = "AGPL-3.0"
|
license = "AGPL-3.0"
|
||||||
repository = "https://github.com/0xMassi/webclaw"
|
repository = "https://github.com/0xMassi/webclaw"
|
||||||
|
|
|
||||||
|
|
@ -1218,6 +1218,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
||||||
exclude_patterns,
|
exclude_patterns,
|
||||||
progress_tx: Some(progress_tx),
|
progress_tx: Some(progress_tx),
|
||||||
cancel_flag: Some(Arc::clone(&cancel_flag)),
|
cancel_flag: Some(Arc::clone(&cancel_flag)),
|
||||||
|
allow_subdomains: false,
|
||||||
|
allow_external_links: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Load resume state if --crawl-state file exists
|
// Load resume state if --crawl-state file exists
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,13 @@
|
||||||
/// Recursive same-origin web crawler built on top of [`FetchClient`].
|
/// Recursive web crawler built on top of [`FetchClient`].
|
||||||
///
|
///
|
||||||
/// Starts from a seed URL, extracts content, discovers links, and follows
|
/// Starts from a seed URL, extracts content, discovers links, and follows
|
||||||
/// them breadth-first up to a configurable depth/page limit. Uses a semaphore
|
/// them breadth-first up to a configurable depth/page limit. Uses a semaphore
|
||||||
/// for bounded concurrency and per-request delays for politeness.
|
/// for bounded concurrency and per-request delays for politeness.
|
||||||
///
|
///
|
||||||
|
/// Scope control: by default only same-origin links are followed. Enable
|
||||||
|
/// `allow_subdomains` to include sibling/child subdomains of the seed host,
|
||||||
|
/// or `allow_external_links` to follow links to any domain.
|
||||||
|
///
|
||||||
/// When `use_sitemap` is enabled, the crawler first discovers URLs from the
|
/// When `use_sitemap` is enabled, the crawler first discovers URLs from the
|
||||||
/// site's sitemaps and seeds the BFS frontier before crawling.
|
/// site's sitemaps and seeds the BFS frontier before crawling.
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
|
@ -39,11 +43,17 @@ pub struct CrawlConfig {
|
||||||
/// Seed BFS frontier from sitemap discovery before crawling.
|
/// Seed BFS frontier from sitemap discovery before crawling.
|
||||||
pub use_sitemap: bool,
|
pub use_sitemap: bool,
|
||||||
/// Glob patterns for paths to include. If non-empty, only matching URLs are crawled.
|
/// Glob patterns for paths to include. If non-empty, only matching URLs are crawled.
|
||||||
/// E.g. `["/api/*", "/guides/*"]` — matched against the URL path.
|
/// E.g. `["/api/*", "/guides/*"]` -- matched against the URL path.
|
||||||
pub include_patterns: Vec<String>,
|
pub include_patterns: Vec<String>,
|
||||||
/// Glob patterns for paths to exclude. Checked after include_patterns.
|
/// Glob patterns for paths to exclude. Checked after include_patterns.
|
||||||
/// E.g. `["/changelog/*", "/blog/*"]` — matching URLs are skipped.
|
/// E.g. `["/changelog/*", "/blog/*"]` -- matching URLs are skipped.
|
||||||
pub exclude_patterns: Vec<String>,
|
pub exclude_patterns: Vec<String>,
|
||||||
|
/// Follow links on subdomains of the seed domain (e.g. blog.example.com
|
||||||
|
/// when crawling example.com). Default: false (same-origin only).
|
||||||
|
pub allow_subdomains: bool,
|
||||||
|
/// Follow links to entirely different domains. Default: false.
|
||||||
|
/// When true, the crawler becomes cross-origin. Use with caution.
|
||||||
|
pub allow_external_links: bool,
|
||||||
/// Optional channel sender for streaming per-page results as they complete.
|
/// Optional channel sender for streaming per-page results as they complete.
|
||||||
/// When set, each `PageResult` is sent on this channel immediately after extraction.
|
/// When set, each `PageResult` is sent on this channel immediately after extraction.
|
||||||
pub progress_tx: Option<tokio::sync::broadcast::Sender<PageResult>>,
|
pub progress_tx: Option<tokio::sync::broadcast::Sender<PageResult>>,
|
||||||
|
|
@ -64,6 +74,8 @@ impl Default for CrawlConfig {
|
||||||
use_sitemap: false,
|
use_sitemap: false,
|
||||||
include_patterns: Vec::new(),
|
include_patterns: Vec::new(),
|
||||||
exclude_patterns: Vec::new(),
|
exclude_patterns: Vec::new(),
|
||||||
|
allow_subdomains: false,
|
||||||
|
allow_external_links: false,
|
||||||
progress_tx: None,
|
progress_tx: None,
|
||||||
cancel_flag: None,
|
cancel_flag: None,
|
||||||
}
|
}
|
||||||
|
|
@ -113,6 +125,8 @@ pub struct Crawler {
|
||||||
client: Arc<FetchClient>,
|
client: Arc<FetchClient>,
|
||||||
config: CrawlConfig,
|
config: CrawlConfig,
|
||||||
seed_origin: String,
|
seed_origin: String,
|
||||||
|
/// Root domain of the seed URL for subdomain matching (e.g. "example.com").
|
||||||
|
seed_root_domain: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Crawler {
|
impl Crawler {
|
||||||
|
|
@ -121,6 +135,7 @@ impl Crawler {
|
||||||
pub fn new(seed_url: &str, config: CrawlConfig) -> Result<Self, FetchError> {
|
pub fn new(seed_url: &str, config: CrawlConfig) -> Result<Self, FetchError> {
|
||||||
let seed = Url::parse(seed_url).map_err(|_| FetchError::InvalidUrl(seed_url.into()))?;
|
let seed = Url::parse(seed_url).map_err(|_| FetchError::InvalidUrl(seed_url.into()))?;
|
||||||
let seed_origin = origin_key(&seed);
|
let seed_origin = origin_key(&seed);
|
||||||
|
let seed_root_domain = root_domain(&seed);
|
||||||
|
|
||||||
let client = FetchClient::new(config.fetch.clone())?;
|
let client = FetchClient::new(config.fetch.clone())?;
|
||||||
|
|
||||||
|
|
@ -128,6 +143,7 @@ impl Crawler {
|
||||||
client: Arc::new(client),
|
client: Arc::new(client),
|
||||||
config,
|
config,
|
||||||
seed_origin,
|
seed_origin,
|
||||||
|
seed_root_domain,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -278,7 +294,7 @@ impl Crawler {
|
||||||
let delay = self.config.delay;
|
let delay = self.config.delay;
|
||||||
|
|
||||||
handles.push(tokio::spawn(async move {
|
handles.push(tokio::spawn(async move {
|
||||||
// Acquire permit — blocks if concurrency limit reached
|
// Acquire permit -- blocks if concurrency limit reached
|
||||||
let _permit = permit.acquire().await.expect("semaphore closed");
|
let _permit = permit.acquire().await.expect("semaphore closed");
|
||||||
tokio::time::sleep(delay).await;
|
tokio::time::sleep(delay).await;
|
||||||
|
|
||||||
|
|
@ -392,10 +408,21 @@ impl Crawler {
|
||||||
_ => return None,
|
_ => return None,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Same-origin check (scheme + host + port)
|
// Scope check: same-origin, subdomain, or external
|
||||||
if origin_key(&parsed) != self.seed_origin {
|
if !self.config.allow_external_links {
|
||||||
|
let link_origin = origin_key(&parsed);
|
||||||
|
if link_origin != self.seed_origin {
|
||||||
|
// Not same-origin. Check if subdomain crawling is allowed.
|
||||||
|
if self.config.allow_subdomains {
|
||||||
|
let link_root = root_domain(&parsed);
|
||||||
|
if link_root != self.seed_root_domain {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Path prefix filter
|
// Path prefix filter
|
||||||
if let Some(ref prefix) = self.config.path_prefix
|
if let Some(ref prefix) = self.config.path_prefix
|
||||||
|
|
@ -457,6 +484,29 @@ fn origin_key(url: &Url) -> String {
|
||||||
format!("{}://{}{}", url.scheme(), host, port_suffix)
|
format!("{}://{}{}", url.scheme(), host, port_suffix)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extract the root domain from a URL for subdomain comparison.
|
||||||
|
/// "blog.docs.example.com" -> "example.com", "example.co.uk" -> "example.co.uk" (best-effort).
|
||||||
|
///
|
||||||
|
/// Uses a simple heuristic: take the last two labels, or three if the second-to-last
|
||||||
|
/// is short (<=3 chars, likely a country SLD like "co.uk", "com.au").
|
||||||
|
fn root_domain(url: &Url) -> String {
|
||||||
|
let host = url.host_str().unwrap_or("");
|
||||||
|
let host = host.strip_prefix("www.").unwrap_or(host);
|
||||||
|
let labels: Vec<&str> = host.split('.').collect();
|
||||||
|
|
||||||
|
if labels.len() <= 2 {
|
||||||
|
return host.to_ascii_lowercase();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Heuristic for two-part TLDs (co.uk, com.au, org.br, etc.)
|
||||||
|
let sld = labels[labels.len() - 2];
|
||||||
|
if labels.len() >= 3 && sld.len() <= 3 {
|
||||||
|
labels[labels.len() - 3..].join(".").to_ascii_lowercase()
|
||||||
|
} else {
|
||||||
|
labels[labels.len() - 2..].join(".").to_ascii_lowercase()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"),
|
/// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"),
|
||||||
/// lowercase scheme + host. Preserves query params and path case.
|
/// lowercase scheme + host. Preserves query params and path case.
|
||||||
fn normalize(url: &Url) -> String {
|
fn normalize(url: &Url) -> String {
|
||||||
|
|
@ -502,7 +552,7 @@ fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
|
||||||
|
|
||||||
while ti < text.len() {
|
while ti < text.len() {
|
||||||
if pi < pat.len() && pat[pi] == b'*' && pi + 1 < pat.len() && pat[pi + 1] == b'*' {
|
if pi < pat.len() && pat[pi] == b'*' && pi + 1 < pat.len() && pat[pi + 1] == b'*' {
|
||||||
// `**` — match everything including slashes
|
// `**` -- match everything including slashes
|
||||||
// Skip all consecutive `*`
|
// Skip all consecutive `*`
|
||||||
while pi < pat.len() && pat[pi] == b'*' {
|
while pi < pat.len() && pat[pi] == b'*' {
|
||||||
pi += 1;
|
pi += 1;
|
||||||
|
|
@ -522,7 +572,7 @@ fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
} else if pi < pat.len() && pat[pi] == b'*' {
|
} else if pi < pat.len() && pat[pi] == b'*' {
|
||||||
// `*` — match any chars except `/`
|
// `*` -- match any chars except `/`
|
||||||
star_pi = pi;
|
star_pi = pi;
|
||||||
star_ti = ti;
|
star_ti = ti;
|
||||||
pi += 1;
|
pi += 1;
|
||||||
|
|
@ -603,6 +653,38 @@ mod tests {
|
||||||
assert_ne!(origin_key(&http), origin_key(&https));
|
assert_ne!(origin_key(&http), origin_key(&https));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -- root_domain tests --
|
||||||
|
|
||||||
|
/// A bare two-label host is already its own root domain.
#[test]
fn root_domain_simple() {
    let seed = Url::parse("https://example.com/page").unwrap();
    let actual = root_domain(&seed);
    assert_eq!(actual, "example.com");
}
|
||||||
|
|
||||||
|
/// A single subdomain level is dropped, leaving the last two labels.
#[test]
fn root_domain_subdomain() {
    let link = Url::parse("https://blog.example.com/page").unwrap();
    let actual = root_domain(&link);
    assert_eq!(actual, "example.com");
}
|
||||||
|
|
||||||
|
/// Any number of nested subdomain labels collapses to the same root.
#[test]
fn root_domain_deep_subdomain() {
    let link = Url::parse("https://a.b.c.example.com/").unwrap();
    let actual = root_domain(&link);
    assert_eq!(actual, "example.com");
}
|
||||||
|
|
||||||
|
/// Hosts under a two-part suffix such as "co.uk" keep three labels.
#[test]
fn root_domain_country_tld() {
    let link = Url::parse("https://blog.example.co.uk/").unwrap();
    let actual = root_domain(&link);
    assert_eq!(actual, "example.co.uk");
}
|
||||||
|
|
||||||
|
/// A leading "www." is not part of the root domain.
#[test]
fn root_domain_strips_www() {
    let link = Url::parse("https://www.example.com/").unwrap();
    let actual = root_domain(&link);
    assert_eq!(actual, "example.com");
}
|
||||||
|
|
||||||
// -- glob_match tests --
|
// -- glob_match tests --
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "create-webclaw",
|
"name": "create-webclaw",
|
||||||
"version": "0.1.3",
|
"version": "0.1.4",
|
||||||
"mcpName": "io.github.0xMassi/webclaw",
|
"mcpName": "io.github.0xMassi/webclaw",
|
||||||
"description": "Set up webclaw MCP server for AI agents (Claude, Cursor, Windsurf, OpenCode, Codex, Antigravity)",
|
"description": "Set up webclaw MCP server for AI agents (Claude, Cursor, Windsurf, OpenCode, Codex, Antigravity)",
|
||||||
"bin": {
|
"bin": {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue