mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-24 15:56:23 +02:00
feat: add allow_subdomains and allow_external_links to CrawlConfig
Crawls are same-origin by default. Enable allow_subdomains to follow sibling/child subdomains (blog.example.com from example.com), or allow_external_links for full cross-origin crawling. Root domain extraction uses a heuristic that handles two-part TLDs (co.uk, com.au). Includes 5 unit tests for root_domain(). Bump to 0.3.12. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a4c351d5ae
commit
050b2ef463
7 changed files with 109 additions and 17 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -3,3 +3,4 @@ target/
|
|||
.env
|
||||
proxies.txt
|
||||
.claude/skills/
|
||||
*.json
|
||||
|
|
|
|||
|
|
@ -3,6 +3,13 @@
|
|||
All notable changes to webclaw are documented here.
|
||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||
|
||||
## [0.3.12] — 2026-04-10
|
||||
|
||||
### Added
|
||||
- **Crawl scope control**: new `allow_subdomains` and `allow_external_links` fields on `CrawlConfig`. By default crawls stay same-origin. Enable `allow_subdomains` to follow sibling/child subdomains (e.g. blog.example.com from example.com), or `allow_external_links` for full cross-origin crawling. Root domain extraction uses a heuristic that handles two-part TLDs (co.uk, com.au).
|
||||
|
||||
---
|
||||
|
||||
## [0.3.11] — 2026-04-10
|
||||
|
||||
### Added
|
||||
|
|
|
|||
12
Cargo.lock
generated
12
Cargo.lock
generated
|
|
@ -3102,7 +3102,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.3.11"
|
||||
version = "0.3.12"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -3122,7 +3122,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.3.11"
|
||||
version = "0.3.12"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
|
|
@ -3140,7 +3140,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.3.11"
|
||||
version = "0.3.12"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"calamine",
|
||||
|
|
@ -3162,7 +3162,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.3.11"
|
||||
version = "0.3.12"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -3175,7 +3175,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.3.11"
|
||||
version = "0.3.12"
|
||||
dependencies = [
|
||||
"dirs",
|
||||
"dotenvy",
|
||||
|
|
@ -3196,7 +3196,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.3.11"
|
||||
version = "0.3.12"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.3.11"
|
||||
version = "0.3.12"
|
||||
edition = "2024"
|
||||
license = "AGPL-3.0"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
|
|||
|
|
@ -1218,6 +1218,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
|||
exclude_patterns,
|
||||
progress_tx: Some(progress_tx),
|
||||
cancel_flag: Some(Arc::clone(&cancel_flag)),
|
||||
allow_subdomains: false,
|
||||
allow_external_links: false,
|
||||
};
|
||||
|
||||
// Load resume state if --crawl-state file exists
|
||||
|
|
|
|||
|
|
@ -1,9 +1,13 @@
|
|||
/// Recursive same-origin web crawler built on top of [`FetchClient`].
|
||||
/// Recursive web crawler built on top of [`FetchClient`].
|
||||
///
|
||||
/// Starts from a seed URL, extracts content, discovers links, and follows
|
||||
/// them breadth-first up to a configurable depth/page limit. Uses a semaphore
|
||||
/// for bounded concurrency and per-request delays for politeness.
|
||||
///
|
||||
/// Scope control: by default only same-origin links are followed. Enable
|
||||
/// `allow_subdomains` to include sibling/child subdomains of the seed host,
|
||||
/// or `allow_external_links` to follow links to any domain.
|
||||
///
|
||||
/// When `use_sitemap` is enabled, the crawler first discovers URLs from the
|
||||
/// site's sitemaps and seeds the BFS frontier before crawling.
|
||||
use std::collections::HashSet;
|
||||
|
|
@ -39,11 +43,17 @@ pub struct CrawlConfig {
|
|||
/// Seed BFS frontier from sitemap discovery before crawling.
|
||||
pub use_sitemap: bool,
|
||||
/// Glob patterns for paths to include. If non-empty, only matching URLs are crawled.
|
||||
/// E.g. `["/api/*", "/guides/*"]` — matched against the URL path.
|
||||
/// E.g. `["/api/*", "/guides/*"]` -- matched against the URL path.
|
||||
pub include_patterns: Vec<String>,
|
||||
/// Glob patterns for paths to exclude. Checked after include_patterns.
|
||||
/// E.g. `["/changelog/*", "/blog/*"]` — matching URLs are skipped.
|
||||
/// E.g. `["/changelog/*", "/blog/*"]` -- matching URLs are skipped.
|
||||
pub exclude_patterns: Vec<String>,
|
||||
/// Follow links on subdomains of the seed domain (e.g. blog.example.com
|
||||
/// when crawling example.com). Default: false (same-origin only).
|
||||
pub allow_subdomains: bool,
|
||||
/// Follow links to entirely different domains. Default: false.
|
||||
/// When true, the crawler becomes cross-origin. Use with caution.
|
||||
pub allow_external_links: bool,
|
||||
/// Optional channel sender for streaming per-page results as they complete.
|
||||
/// When set, each `PageResult` is sent on this channel immediately after extraction.
|
||||
pub progress_tx: Option<tokio::sync::broadcast::Sender<PageResult>>,
|
||||
|
|
@ -64,6 +74,8 @@ impl Default for CrawlConfig {
|
|||
use_sitemap: false,
|
||||
include_patterns: Vec::new(),
|
||||
exclude_patterns: Vec::new(),
|
||||
allow_subdomains: false,
|
||||
allow_external_links: false,
|
||||
progress_tx: None,
|
||||
cancel_flag: None,
|
||||
}
|
||||
|
|
@ -113,6 +125,8 @@ pub struct Crawler {
|
|||
client: Arc<FetchClient>,
|
||||
config: CrawlConfig,
|
||||
seed_origin: String,
|
||||
/// Root domain of the seed URL for subdomain matching (e.g. "example.com").
|
||||
seed_root_domain: String,
|
||||
}
|
||||
|
||||
impl Crawler {
|
||||
|
|
@ -121,6 +135,7 @@ impl Crawler {
|
|||
pub fn new(seed_url: &str, config: CrawlConfig) -> Result<Self, FetchError> {
|
||||
let seed = Url::parse(seed_url).map_err(|_| FetchError::InvalidUrl(seed_url.into()))?;
|
||||
let seed_origin = origin_key(&seed);
|
||||
let seed_root_domain = root_domain(&seed);
|
||||
|
||||
let client = FetchClient::new(config.fetch.clone())?;
|
||||
|
||||
|
|
@ -128,6 +143,7 @@ impl Crawler {
|
|||
client: Arc::new(client),
|
||||
config,
|
||||
seed_origin,
|
||||
seed_root_domain,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -278,7 +294,7 @@ impl Crawler {
|
|||
let delay = self.config.delay;
|
||||
|
||||
handles.push(tokio::spawn(async move {
|
||||
// Acquire permit — blocks if concurrency limit reached
|
||||
// Acquire permit -- blocks if concurrency limit reached
|
||||
let _permit = permit.acquire().await.expect("semaphore closed");
|
||||
tokio::time::sleep(delay).await;
|
||||
|
||||
|
|
@ -392,9 +408,20 @@ impl Crawler {
|
|||
_ => return None,
|
||||
}
|
||||
|
||||
// Same-origin check (scheme + host + port)
|
||||
if origin_key(&parsed) != self.seed_origin {
|
||||
return None;
|
||||
// Scope check: same-origin, subdomain, or external
|
||||
if !self.config.allow_external_links {
|
||||
let link_origin = origin_key(&parsed);
|
||||
if link_origin != self.seed_origin {
|
||||
// Not same-origin. Check if subdomain crawling is allowed.
|
||||
if self.config.allow_subdomains {
|
||||
let link_root = root_domain(&parsed);
|
||||
if link_root != self.seed_root_domain {
|
||||
return None;
|
||||
}
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Path prefix filter
|
||||
|
|
@ -457,6 +484,29 @@ fn origin_key(url: &Url) -> String {
|
|||
format!("{}://{}{}", url.scheme(), host, port_suffix)
|
||||
}
|
||||
|
||||
/// Extract the root domain from a URL for subdomain comparison.
|
||||
/// "blog.docs.example.com" -> "example.com", "example.co.uk" -> "example.co.uk" (best-effort).
|
||||
///
|
||||
/// Uses a simple heuristic: take the last two labels, or three if the second-to-last
|
||||
/// is short (<=3 chars, likely a country SLD like "co.uk", "com.au").
|
||||
fn root_domain(url: &Url) -> String {
|
||||
let host = url.host_str().unwrap_or("");
|
||||
let host = host.strip_prefix("www.").unwrap_or(host);
|
||||
let labels: Vec<&str> = host.split('.').collect();
|
||||
|
||||
if labels.len() <= 2 {
|
||||
return host.to_ascii_lowercase();
|
||||
}
|
||||
|
||||
// Heuristic for two-part TLDs (co.uk, com.au, org.br, etc.)
|
||||
let sld = labels[labels.len() - 2];
|
||||
if labels.len() >= 3 && sld.len() <= 3 {
|
||||
labels[labels.len() - 3..].join(".").to_ascii_lowercase()
|
||||
} else {
|
||||
labels[labels.len() - 2..].join(".").to_ascii_lowercase()
|
||||
}
|
||||
}
|
||||
|
||||
/// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"),
|
||||
/// lowercase scheme + host. Preserves query params and path case.
|
||||
fn normalize(url: &Url) -> String {
|
||||
|
|
@ -502,7 +552,7 @@ fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
|
|||
|
||||
while ti < text.len() {
|
||||
if pi < pat.len() && pat[pi] == b'*' && pi + 1 < pat.len() && pat[pi + 1] == b'*' {
|
||||
// `**` — match everything including slashes
|
||||
// `**` -- match everything including slashes
|
||||
// Skip all consecutive `*`
|
||||
while pi < pat.len() && pat[pi] == b'*' {
|
||||
pi += 1;
|
||||
|
|
@ -522,7 +572,7 @@ fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
|
|||
}
|
||||
return false;
|
||||
} else if pi < pat.len() && pat[pi] == b'*' {
|
||||
// `*` — match any chars except `/`
|
||||
// `*` -- match any chars except `/`
|
||||
star_pi = pi;
|
||||
star_ti = ti;
|
||||
pi += 1;
|
||||
|
|
@ -603,6 +653,38 @@ mod tests {
|
|||
assert_ne!(origin_key(&http), origin_key(&https));
|
||||
}
|
||||
|
||||
// -- root_domain tests --
|
||||
|
||||
#[test]
fn root_domain_simple() {
    // A bare two-label host is its own root domain.
    let parsed = Url::parse("https://example.com/page").unwrap();
    assert_eq!(root_domain(&parsed), "example.com");
}
|
||||
|
||||
#[test]
fn root_domain_subdomain() {
    // A single subdomain label is stripped down to the registrable domain.
    let parsed = Url::parse("https://blog.example.com/page").unwrap();
    assert_eq!(root_domain(&parsed), "example.com");
}
|
||||
|
||||
#[test]
fn root_domain_deep_subdomain() {
    // Arbitrarily deep subdomain chains still collapse to the root.
    let parsed = Url::parse("https://a.b.c.example.com/").unwrap();
    assert_eq!(root_domain(&parsed), "example.com");
}
|
||||
|
||||
#[test]
fn root_domain_country_tld() {
    // Two-part country TLD: the short "co" SLD triggers the keep-three heuristic.
    let parsed = Url::parse("https://blog.example.co.uk/").unwrap();
    assert_eq!(root_domain(&parsed), "example.co.uk");
}
|
||||
|
||||
#[test]
fn root_domain_strips_www() {
    // A leading "www." prefix is dropped before domain extraction.
    let parsed = Url::parse("https://www.example.com/").unwrap();
    assert_eq!(root_domain(&parsed), "example.com");
}
|
||||
|
||||
// -- glob_match tests --
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "create-webclaw",
|
||||
"version": "0.1.3",
|
||||
"version": "0.1.4",
|
||||
"mcpName": "io.github.0xMassi/webclaw",
|
||||
"description": "Set up webclaw MCP server for AI agents (Claude, Cursor, Windsurf, OpenCode, Codex, Antigravity)",
|
||||
"bin": {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue