webclaw/crates/webclaw-fetch/src/crawler.rs

//! Recursive web crawler built on top of [`FetchClient`].
//!
//! Starts from a seed URL, extracts content, discovers links, and follows
//! them breadth-first up to a configurable depth/page limit. Uses a semaphore
//! for bounded concurrency and per-request delays for politeness.
//!
//! Scope control: by default only same-origin links are followed. Enable
//! `allow_subdomains` to include sibling/child subdomains of the seed host,
//! or `allow_external_links` to follow links to any domain.
//!
//! When `use_sitemap` is enabled, the crawler first discovers URLs from the
//! site's sitemaps and seeds the BFS frontier before crawling.
use std::collections::HashSet;
use std::path::Path;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::{Duration, Instant};

use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tracing::{debug, info, warn};
use url::Url;

use crate::client::{FetchClient, FetchConfig};
use crate::error::FetchError;
use crate::sitemap;

/// Controls crawl scope, depth, concurrency, and politeness.
///
/// All fields are public; start from [`CrawlConfig::default`] and override
/// what you need with struct-update syntax.
#[derive(Debug, Clone)]
pub struct CrawlConfig {
    /// Fetch configuration (browser profile, proxy, timeout, etc.)
    pub fetch: FetchConfig,
    /// How deep to follow links. 1 = only immediate links from seed page.
    /// The seed is depth 0; links are harvested only from pages whose depth
    /// is strictly below this value.
    pub max_depth: usize,
    /// Hard cap on total pages fetched (including the seed).
    pub max_pages: usize,
    /// Max concurrent in-flight requests.
    pub concurrency: usize,
    /// Minimum delay before each request (politeness).
    /// Applied per task, after its concurrency permit has been acquired.
    pub delay: Duration,
    /// Only follow URLs whose path starts with this prefix (e.g. "/docs/").
    pub path_prefix: Option<String>,
    /// Seed BFS frontier from sitemap discovery before crawling.
    pub use_sitemap: bool,
    /// Glob patterns for paths to include. If non-empty, only matching URLs are crawled.
    /// E.g. `["/api/*", "/guides/*"]` -- matched against the URL path.
    pub include_patterns: Vec<String>,
    /// Glob patterns for paths to exclude. Checked after include_patterns.
    /// E.g. `["/changelog/*", "/blog/*"]` -- matching URLs are skipped.
    pub exclude_patterns: Vec<String>,
    /// Follow links on subdomains of the seed domain (e.g. blog.example.com
    /// when crawling example.com). Default: false (same-origin only).
    pub allow_subdomains: bool,
    /// Follow links to entirely different domains. Default: false.
    /// When true, the crawler becomes cross-origin. Use with caution.
    pub allow_external_links: bool,
    /// Optional broadcast sender for streaming per-page results as they complete.
    /// When set, each `PageResult` is sent on this channel as soon as the
    /// crawler collects it. Send errors are ignored by the crawler.
    pub progress_tx: Option<tokio::sync::broadcast::Sender<PageResult>>,
    /// When set to `true`, the crawler breaks out of the main loop early.
    /// Callers (e.g. a Ctrl+C handler) can flip this to request graceful cancellation.
    /// The flag is polled before each batch and between collected page results.
    pub cancel_flag: Option<Arc<AtomicBool>>,
}

impl Default for CrawlConfig {
    /// Conservative defaults: one level deep, at most 50 pages, 5 requests
    /// in flight, 100 ms politeness delay, same-origin only, no sitemap
    /// seeding, no path filters, no progress streaming, no cancellation hook.
    fn default() -> Self {
        // Size and politeness knobs, named so the literals read clearly.
        let (max_depth, max_pages, concurrency) = (1, 50, 5);
        let delay = Duration::from_millis(100);

        Self {
            fetch: FetchConfig::default(),
            max_depth,
            max_pages,
            concurrency,
            delay,
            // No path filtering by default.
            path_prefix: None,
            include_patterns: vec![],
            exclude_patterns: vec![],
            // Stay on the seed origin and discover links by crawling only.
            use_sitemap: false,
            allow_subdomains: false,
            allow_external_links: false,
            // Optional integrations are off until the caller wires them up.
            progress_tx: None,
            cancel_flag: None,
        }
    }
}

/// Aggregated results from a crawl run.
///
/// `visited` and `remaining_frontier` are skipped during (de)serialization;
/// they exist so the caller can persist resume state separately.
#[derive(Debug, Serialize, Deserialize)]
pub struct CrawlResult {
    /// One entry per page the crawler collected, successful or not.
    pub pages: Vec<PageResult>,
    /// Number of entries in `pages`.
    pub total: usize,
    /// Pages that produced an extraction.
    pub ok: usize,
    /// Pages without an extraction (`total - ok`).
    pub errors: usize,
    /// Wall-clock duration of the crawl, in seconds.
    pub elapsed_secs: f64,
    /// URLs visited during this crawl (for resume state).
    #[serde(skip)]
    pub visited: HashSet<String>,
    /// Remaining frontier when crawl was cancelled (for resume state).
    /// Each entry is `(normalized_url, depth)`.
    #[serde(skip)]
    pub remaining_frontier: Vec<(String, usize)>,
}

/// Outcome of extracting a single page during the crawl.
///
/// The crawler fills exactly one of `extraction` / `error` for each page.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageResult {
    /// Normalized URL that was fetched.
    pub url: String,
    /// BFS depth at which the URL was reached (seed = 0).
    pub depth: usize,
    /// Extracted content on success.
    pub extraction: Option<webclaw_core::ExtractionResult>,
    /// Error description on failure.
    pub error: Option<String>,
    /// Time spent on this page, measured inside the crawl task (includes
    /// waiting for a concurrency permit and the politeness delay).
    /// Not serialized.
    #[serde(skip)]
    pub elapsed: Duration,
}

/// Serializable crawl state for resume after Ctrl+C cancellation.
///
/// Written by [`Crawler::save_state`] and read back by [`Crawler::load_state`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlState {
    /// Seed URL the crawl was started from.
    pub seed_url: String,
    /// URLs already visited; restored into the crawler's visited set on resume.
    pub visited: Vec<String>,
    /// Pending `(url, depth)` entries still to be crawled.
    pub frontier: Vec<(String, usize)>,
    /// Number of pages completed when the state was saved.
    pub completed_pages: usize,
    /// `max_pages` setting recorded at save time.
    pub max_pages: usize,
    /// `max_depth` setting recorded at save time.
    pub max_depth: usize,
}

/// Recursive crawler that wraps a shared [`FetchClient`].
///
/// The seed-derived fields are computed once in [`Crawler::new`] and used by
/// the link filter to decide whether a discovered URL is in scope.
pub struct Crawler {
    /// Shared HTTP client; cloned (by `Arc`) into each spawned fetch task.
    client: Arc<FetchClient>,
    /// Crawl settings supplied by the caller.
    config: CrawlConfig,
    /// Origin key of the seed URL, used for the same-origin check.
    seed_origin: String,
    /// Root domain of the seed URL for subdomain matching (e.g. "example.com").
    seed_root_domain: String,
}

impl Crawler {
    /// Create a crawler rooted at `seed_url`.
    ///
    /// The seed's origin and root domain are captured for later scope checks,
    /// and the underlying `FetchClient` is built from `config.fetch`.
    ///
    /// # Errors
    ///
    /// - `FetchError::InvalidUrl` if `seed_url` does not parse.
    /// - Whatever `validate_glob` returns for a rejected include/exclude pattern.
    /// - Whatever `FetchClient::new` returns if the client cannot be built.
    pub fn new(seed_url: &str, config: CrawlConfig) -> Result<Self, FetchError> {
        let Ok(seed) = Url::parse(seed_url) else {
            return Err(FetchError::InvalidUrl(seed_url.into()));
        };

        // User-supplied globs are vetted up front: the matcher backtracks
        // recursively on `**`, so a pathological pattern would otherwise be
        // paid for on every link of every crawled page.
        config
            .include_patterns
            .iter()
            .chain(&config.exclude_patterns)
            .try_for_each(|pat| validate_glob(pat))?;

        let client = Arc::new(FetchClient::new(config.fetch.clone())?);

        Ok(Self {
            client,
            seed_origin: origin_key(&seed),
            seed_root_domain: root_domain(&seed),
            config,
        })
    }

    /// Persist a crawl snapshot as pretty-printed JSON at `path`.
    ///
    /// The arguments are copied into a [`CrawlState`] which is then
    /// serialized and written in one `std::fs::write` call.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message if serialization or the file write
    /// fails.
    pub fn save_state(
        path: &Path,
        seed_url: &str,
        visited: &HashSet<String>,
        frontier: &[(String, usize)],
        completed_pages: usize,
        max_pages: usize,
        max_depth: usize,
    ) -> Result<(), String> {
        let snapshot = CrawlState {
            seed_url: seed_url.to_owned(),
            visited: visited.iter().cloned().collect(),
            frontier: frontier.to_vec(),
            completed_pages,
            max_pages,
            max_depth,
        };

        let json = match serde_json::to_string_pretty(&snapshot) {
            Ok(text) => text,
            Err(e) => return Err(format!("serialize state: {e}")),
        };

        match std::fs::write(path, json) {
            Ok(()) => Ok(()),
            Err(e) => Err(format!("write state to {}: {e}", path.display())),
        }
    }

    /// Read a previously saved [`CrawlState`] from `path`.
    ///
    /// Returns `None` when the file cannot be read (e.g. it does not exist)
    /// or when its contents do not deserialize into a `CrawlState`; the
    /// underlying error is discarded in both cases.
    pub fn load_state(path: &Path) -> Option<CrawlState> {
        std::fs::read_to_string(path)
            .ok()
            .and_then(|raw| serde_json::from_str(&raw).ok())
    }

    /// Whether the caller has requested cancellation.
    ///
    /// Returns `false` when no cancel flag is configured. The flag is read
    /// with `Ordering::Acquire` so the ordering is explicit and pairs with a
    /// `Release` (or stronger) store on the side that sets it.
    fn is_cancelled(&self) -> bool {
        match &self.config.cancel_flag {
            Some(flag) => flag.load(Ordering::Acquire),
            None => false,
        }
    }

    /// Crawl starting from `start_url`, returning results for every page visited.
    ///
    /// Uses breadth-first traversal: all pages at depth N are fetched (concurrently,
    /// bounded by `config.concurrency`) before moving to depth N+1.
    ///
    /// When `config.use_sitemap` is true, sitemap URLs are discovered first and
    /// added to the initial frontier at depth 0 alongside the seed URL.
    ///
    /// If `resume_state` is provided, the crawl resumes from the saved state
    /// (pre-populated visited set and frontier) instead of starting fresh.
    pub async fn crawl(&self, start_url: &str, resume_state: Option<CrawlState>) -> CrawlResult {
        let start = Instant::now();

        let seed = match Url::parse(start_url) {
            Ok(u) => u,
            Err(_) => {
                return CrawlResult {
                    pages: vec![PageResult {
                        url: start_url.to_string(),
                        depth: 0,
                        extraction: None,
                        error: Some(format!("invalid URL: {start_url}")),
                        elapsed: Duration::ZERO,
                    }],
                    total: 1,
                    ok: 0,
                    errors: 1,
                    elapsed_secs: 0.0,
                    visited: HashSet::new(),
                    remaining_frontier: Vec::new(),
                };
            }
        };

        let semaphore = Arc::new(Semaphore::new(self.config.concurrency));
        let mut visited: HashSet<String>;
        let mut pages: Vec<PageResult> = Vec::new();
        let mut frontier: Vec<(String, usize)>;

        // Resume from saved state or start fresh
        if let Some(state) = resume_state {
            visited = state.visited.into_iter().collect();
            frontier = state.frontier;
            info!(
                visited = visited.len(),
                frontier = frontier.len(),
                "resuming crawl from saved state"
            );
        } else {
            visited = HashSet::new();
            frontier = vec![(normalize(&seed), 0)];

            // Seed frontier from sitemap if enabled
            if self.config.use_sitemap {
                let base_url = format!("{}://{}", seed.scheme(), seed.host_str().unwrap_or(""));
                match sitemap::discover(&self.client, &base_url).await {
                    Ok(entries) => {
                        let before = frontier.len();
                        for entry in entries {
                            if self.qualify_link(&entry.url, &visited).is_some() {
                                let parsed = match Url::parse(&entry.url) {
                                    Ok(u) => u,
                                    Err(_) => continue,
                                };
                                let norm = normalize(&parsed);
                                frontier.push((norm, 0));
                            }
                        }
                        let added = frontier.len() - before;
                        info!(
                            sitemap_urls = added,
                            "seeded frontier from sitemap discovery"
                        );
                    }
                    Err(e) => {
                        warn!(error = %e, "sitemap discovery failed, continuing with seed URL only");
                    }
                }
            }
        }

        while !frontier.is_empty() && pages.len() < self.config.max_pages {
            // Check cancel flag before processing each batch
            if self.is_cancelled() {
                info!("crawl cancelled by user");
                break;
            }

            // Dedup this level's frontier against the visited set and page cap
            let batch: Vec<(String, usize)> = frontier
                .drain(..)
                .filter(|(url, _)| visited.insert(url.clone()))
                .take(self.config.max_pages.saturating_sub(pages.len()))
                .collect();

            if batch.is_empty() {
                break;
            }

            // Spawn one task per URL, bounded by semaphore
            let mut handles = Vec::with_capacity(batch.len());

            for (url, depth) in &batch {
                let permit = Arc::clone(&semaphore);
                let client = Arc::clone(&self.client);
                let url = url.clone();
                let depth = *depth;
                let delay = self.config.delay;

                handles.push(tokio::spawn(async move {
                    // Acquire permit -- blocks if concurrency limit reached.
                    // Surface semaphore-closed as a failed PageResult rather
                    // than panicking the spawned task and silently dropping
                    // it from the batch.
                    let page_start = Instant::now();
                    let result = match permit.acquire().await {
                        Ok(_permit) => {
                            tokio::time::sleep(delay).await;
                            client.fetch_and_extract(&url).await
                        }
                        Err(_) => {
                            warn!(url = %url, depth, "semaphore closed before acquire");
                            return PageResult {
                                url,
                                depth,
                                extraction: None,
                                error: Some("semaphore closed before acquire".into()),
                                elapsed: page_start.elapsed(),
                            };
                        }
                    };
                    let elapsed = page_start.elapsed();

                    match result {
                        Ok(extraction) => {
                            debug!(
                                url = %url, depth,
                                elapsed_ms = %elapsed.as_millis(),
                                "page extracted"
                            );
                            PageResult {
                                url,
                                depth,
                                extraction: Some(extraction),
                                error: None,
                                elapsed,
                            }
                        }
                        Err(e) => {
                            warn!(url = %url, depth, error = %e, "page failed");
                            PageResult {
                                url,
                                depth,
                                extraction: None,
                                error: Some(e.to_string()),
                                elapsed,
                            }
                        }
                    }
                }));
            }

            // Collect results and harvest links for the next depth level
            let mut next_frontier: Vec<(String, usize)> = Vec::new();

            for handle in handles {
                let page = match handle.await {
                    Ok(page) => page,
                    Err(e) => {
                        warn!(error = %e, "crawl task panicked");
                        continue;
                    }
                };
                let depth = page.depth;

                if depth < self.config.max_depth
                    && let Some(ref extraction) = page.extraction
                {
                    for link in &extraction.content.links {
                        if let Some(candidate) = self.qualify_link(&link.href, &visited) {
                            next_frontier.push((candidate, depth + 1));
                        }
                    }
                }

                // Stream progress if a channel is configured
                if let Some(tx) = &self.config.progress_tx {
                    let _ = tx.send(page.clone());
                }

                pages.push(page);

                if pages.len() >= self.config.max_pages {
                    break;
                }

                // Check cancel flag between page results
                if self.is_cancelled() {
                    info!("crawl cancelled by user (mid-batch)");
                    break;
                }
            }

            // Cap frontier size independently of max_pages. Pages like
            // search-result listings or tag clouds can emit thousands of
            // links per page; without this a single dense page could push
            // the frontier into the tens of thousands of entries and keep
            // String allocations alive even after max_pages halts crawling.
            // Trim aggressively once we exceed 10× max_pages, keeping the
            // most recently discovered entries which are still on-topic
            // (breadth-first = siblings of the last page we saw).
            let frontier_cap = self.config.max_pages.saturating_mul(10).max(100);
            if next_frontier.len() > frontier_cap {
                let keep = self.config.max_pages.saturating_mul(5).max(50);
                warn!(
                    frontier = next_frontier.len(),
                    cap = frontier_cap,
                    trimmed_to = keep,
                    "frontier exceeded cap, truncating"
                );
                next_frontier.truncate(keep);
            }

            frontier = next_frontier;
        }

        let total_elapsed = start.elapsed();
        let ok_count = pages.iter().filter(|p| p.extraction.is_some()).count();
        let err_count = pages.len() - ok_count;
        info!(
            total = pages.len(),
            ok = ok_count,
            errors = err_count,
            elapsed_ms = %total_elapsed.as_millis(),
            "crawl complete"
        );

        CrawlResult {
            total: pages.len(),
            ok: ok_count,
            errors: err_count,
            elapsed_secs: total_elapsed.as_secs_f64(),
            remaining_frontier: frontier,
            visited,
            pages,
        }
    }

    /// Check if a discovered link should be added to the frontier.
    /// Returns `Some(normalized_url)` if it passes all filters, `None` otherwise.
    ///
    /// Filters, in order:
    /// 1. `href` must parse as an absolute URL with an `http`/`https` scheme.
    /// 2. Scope: same origin as the seed, or (with `allow_subdomains`) same
    ///    root domain, or anything when `allow_external_links` is set.
    /// 3. `path_prefix`, then `include_patterns`, then `exclude_patterns`,
    ///    all matched against the URL path.
    /// 4. Paths ending in a known non-page extension are skipped; the
    ///    comparison ignores ASCII case so `/Report.PDF` is skipped too.
    /// 5. The normalized URL must not already be in `visited`.
    fn qualify_link(&self, href: &str, visited: &HashSet<String>) -> Option<String> {
        // Skip common non-page file extensions
        const SKIP_EXTENSIONS: &[&str] = &[
            ".pdf", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico", ".css", ".js",
            ".zip", ".tar", ".gz", ".xml", ".rss", ".mp3", ".mp4", ".avi", ".mov", ".woff",
            ".woff2", ".ttf", ".eot",
        ];

        let parsed = Url::parse(href).ok()?;

        // Only http(s) schemes
        if !matches!(parsed.scheme(), "http" | "https") {
            return None;
        }

        // Scope check: same-origin, subdomain, or external
        if !self.config.allow_external_links && origin_key(&parsed) != self.seed_origin {
            let same_site =
                self.config.allow_subdomains && root_domain(&parsed) == self.seed_root_domain;
            if !same_site {
                return None;
            }
        }

        let path = parsed.path();

        // Path prefix filter
        if let Some(ref prefix) = self.config.path_prefix
            && !path.starts_with(prefix.as_str())
        {
            return None;
        }

        // Include patterns: if any are set, path must match at least one
        let includes = &self.config.include_patterns;
        if !includes.is_empty() && !includes.iter().any(|pat| glob_match(pat, path)) {
            return None;
        }

        // Exclude patterns: if path matches any, skip
        if self
            .config
            .exclude_patterns
            .iter()
            .any(|pat| glob_match(pat, path))
        {
            return None;
        }

        // Byte-wise suffix comparison: avoids allocating a lowercased copy of
        // the path and cannot split a multi-byte character.
        let path_bytes = path.as_bytes();
        let is_asset = SKIP_EXTENSIONS.iter().any(|ext| {
            path_bytes.len() >= ext.len()
                && path_bytes[path_bytes.len() - ext.len()..].eq_ignore_ascii_case(ext.as_bytes())
        });
        if is_asset {
            return None;
        }

        let normalized = normalize(&parsed);

        if visited.contains(&normalized) {
            return None;
        }

        Some(normalized)
    }
}

/// Build the key used for same-origin comparison: `scheme://host[:port]`.
///
/// A leading `www.` is dropped from the host, so `www.example.com` and
/// `example.com` compare equal. The port is included only when `Url::port`
/// reports one.
fn origin_key(url: &Url) -> String {
    let raw_host = url.host_str().unwrap_or("");
    let host = raw_host.strip_prefix("www.").unwrap_or(raw_host);

    let mut key = format!("{}://{}", url.scheme(), host);
    if let Some(port) = url.port() {
        key.push(':');
        key.push_str(&port.to_string());
    }
    key
}

/// Extract the root domain from a URL for subdomain comparison.
/// "blog.docs.example.com" -> "example.com", "example.co.uk" -> "example.co.uk" (best-effort).
///
/// Uses a simple heuristic: take the last two labels, or three if the second-to-last
/// is short (<=3 chars, likely a country SLD like "co.uk", "com.au"). A leading
/// `www.` is stripped first and the result is ASCII-lowercased.
///
/// IPv4 hosts are returned whole: their dot-separated octets are not domain
/// labels, and trimming them would make unrelated addresses that share
/// trailing octets look like the same site.
fn root_domain(url: &Url) -> String {
    let host = url.host_str().unwrap_or("");
    let host = host.strip_prefix("www.").unwrap_or(host);

    // An IP address has no registrable parent domain.
    if host.parse::<std::net::Ipv4Addr>().is_ok() {
        return host.to_string();
    }

    let labels: Vec<&str> = host.split('.').collect();
    if labels.len() <= 2 {
        return host.to_ascii_lowercase();
    }

    // Heuristic for two-part TLDs (co.uk, com.au, org.br, etc.)
    let sld = labels[labels.len() - 2];
    let keep = if sld.len() <= 3 { 3 } else { 2 };
    labels[labels.len() - keep..].join(".").to_ascii_lowercase()
}

/// Produce the canonical string used to deduplicate URLs.
///
/// Output shape: `scheme://host[:port]path[?query]`.
/// - the host is ASCII-lowercased;
/// - one trailing `/` is removed from the path unless the path is just `/`;
/// - the query string is kept verbatim and path case is preserved;
/// - the fragment is dropped.
fn normalize(url: &Url) -> String {
    let host = url.host_str().unwrap_or("").to_ascii_lowercase();
    let mut out = format!("{}://{}", url.scheme(), host);

    if let Some(port) = url.port() {
        out.push(':');
        out.push_str(&port.to_string());
    }

    // Strip a single trailing slash, but never reduce "/" to "".
    let path = url.path();
    let trimmed = match path.strip_suffix('/') {
        Some(rest) if path.len() > 1 => rest,
        _ => path,
    };
    out.push_str(trimmed);

    if let Some(query) = url.query() {
        out.push('?');
        out.push_str(query);
    }

    // The fragment is deliberately never appended.
    out
}

/// Upper bound on the number of `**` wildcards in one user-supplied glob.
/// Every extra `**` multiplies the backtracking fan-out of
/// `glob_match_inner` on adversarial paths; 4 leaves room for legitimate
/// nested include/exclude patterns while keeping matching cheap.
const MAX_GLOB_DOUBLESTAR: usize = 4;

/// Upper bound on the length of one glob pattern (measured with
/// `str::len`, i.e. in bytes). Guards against someone pasting garbage
/// into --include.
const MAX_GLOB_LEN: usize = 1024;

/// Vet a user-supplied glob pattern before it reaches the matcher.
///
/// # Errors
///
/// Returns `FetchError::Build` when the pattern is longer than
/// [`MAX_GLOB_LEN`] or contains more than [`MAX_GLOB_DOUBLESTAR`] `**`
/// wildcards. A run of two or more consecutive `*` counts as a single `**`.
fn validate_glob(pat: &str) -> Result<(), FetchError> {
    if pat.len() > MAX_GLOB_LEN {
        return Err(FetchError::Build(format!(
            "glob pattern exceeds {MAX_GLOB_LEN} chars ({} given)",
            pat.len()
        )));
    }

    // Splitting on every non-`*` byte leaves exactly the runs of `*`
    // (plus empty slices); a run of length >= 2 is one `**` wildcard.
    let count = pat
        .as_bytes()
        .split(|&b| b != b'*')
        .filter(|run| run.len() >= 2)
        .count();

    if count > MAX_GLOB_DOUBLESTAR {
        return Err(FetchError::Build(format!(
            "glob pattern has {count} `**` wildcards (max {MAX_GLOB_DOUBLESTAR})"
        )));
    }
    Ok(())
}

/// Simple glob matching for URL paths. Supports:
/// - `*` matches any characters within a single path segment (no `/`)
/// - `**` matches any characters including `/` (any number of segments)
/// - `**/` matches zero or more whole segments: whatever follows it must
///   start at a segment boundary
/// - `?` matches any single byte (including `/`)
/// - Literal characters match exactly
///
/// Examples:
/// - `/api/*` matches `/api/users` but not `/api/users/123`
/// - `/api/**` matches `/api/users`, `/api/users/123`, `/api/a/b/c`
/// - `/docs/*/intro` matches `/docs/v2/intro`
/// - `/a/**/b` matches `/a/b` and `/a/x/y/b` but not `/a/xb`
fn glob_match(pattern: &str, path: &str) -> bool {
    glob_match_inner(pattern.as_bytes(), path.as_bytes())
}

/// Byte-level matcher behind [`glob_match`].
///
/// Single `*` is handled iteratively by remembering the last star and
/// letting it absorb one more byte on mismatch (never a `/`). `**` is
/// handled by recursing on the rest of the pattern at each candidate
/// position of the remaining text.
fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
    let mut pi = 0;
    let mut ti = 0;
    // Position of the most recent single `*` in the pattern, if any.
    let mut star_pi = usize::MAX;
    // Text position that star is currently anchored at.
    let mut star_ti = 0;

    while ti < text.len() {
        if pi < pat.len() && pat[pi] == b'*' && pi + 1 < pat.len() && pat[pi + 1] == b'*' {
            // `**` -- match everything including slashes
            // Skip all consecutive `*`
            while pi < pat.len() && pat[pi] == b'*' {
                pi += 1;
            }
            // Skip the `/` after `**` so that `**/` can match zero segments.
            let slash_skipped = pi < pat.len() && pat[pi] == b'/';
            if slash_skipped {
                pi += 1;
            }
            if pi >= pat.len() {
                return true; // `**` at end matches everything
            }
            let rest = &pat[pi..];
            // Try the rest of the pattern against suffixes of the text. When
            // a `/` was consumed as part of `**/`, only suffixes that begin a
            // segment are valid; otherwise `/a/**/b` would match `/a/xb`.
            return (ti..=text.len()).any(|start| {
                let at_boundary = start == ti || text[start - 1] == b'/';
                (!slash_skipped || at_boundary) && glob_match_inner(rest, &text[start..])
            });
        } else if pi < pat.len() && pat[pi] == b'*' {
            // `*` -- match any chars except `/`
            star_pi = pi;
            star_ti = ti;
            pi += 1;
        } else if pi < pat.len() && (pat[pi] == text[ti] || pat[pi] == b'?') {
            pi += 1;
            ti += 1;
        } else if star_pi != usize::MAX {
            // Backtrack: `*` absorbs one more char (but not `/`)
            if text[star_ti] == b'/' {
                return false;
            }
            star_ti += 1;
            ti = star_ti;
            pi = star_pi + 1;
        } else {
            return false;
        }
    }

    // Consume trailing `*` or `**` in pattern
    while pi < pat.len() && pat[pi] == b'*' {
        pi += 1;
    }

    pi >= pat.len()
}

// Unit tests for the pure helpers in this module: URL normalization,
// origin/root-domain derivation, glob validation and glob matching.
// None of them perform network I/O.
#[cfg(test)]
mod tests {
    use super::*;

    // -- normalize tests --

    #[test]
    fn normalize_strips_fragment() {
        let url = Url::parse("https://example.com/page#section").unwrap();
        assert_eq!(normalize(&url), "https://example.com/page");
    }

    #[test]
    fn normalize_strips_trailing_slash() {
        let url = Url::parse("https://example.com/docs/").unwrap();
        assert_eq!(normalize(&url), "https://example.com/docs");
    }

    #[test]
    fn normalize_keeps_root_slash() {
        let url = Url::parse("https://example.com/").unwrap();
        assert_eq!(normalize(&url), "https://example.com/");
    }

    #[test]
    fn normalize_preserves_query() {
        let url = Url::parse("https://example.com/search?q=rust&page=2").unwrap();
        assert_eq!(normalize(&url), "https://example.com/search?q=rust&page=2");
    }

    #[test]
    fn normalize_lowercases_host() {
        let url = Url::parse("https://Example.COM/Path").unwrap();
        assert_eq!(normalize(&url), "https://example.com/Path");
    }

    // -- origin_key tests --

    #[test]
    fn origin_includes_explicit_port() {
        let url = Url::parse("https://example.com:8443/foo").unwrap();
        assert_eq!(origin_key(&url), "https://example.com:8443");
    }

    #[test]
    fn origin_omits_default_port() {
        let url = Url::parse("https://example.com/foo").unwrap();
        assert_eq!(origin_key(&url), "https://example.com");
    }

    #[test]
    fn different_schemes_are_different_origins() {
        let http = Url::parse("http://example.com/").unwrap();
        let https = Url::parse("https://example.com/").unwrap();
        assert_ne!(origin_key(&http), origin_key(&https));
    }

    // -- root_domain tests --

    #[test]
    fn root_domain_simple() {
        let url = Url::parse("https://example.com/page").unwrap();
        assert_eq!(root_domain(&url), "example.com");
    }

    #[test]
    fn root_domain_subdomain() {
        let url = Url::parse("https://blog.example.com/page").unwrap();
        assert_eq!(root_domain(&url), "example.com");
    }

    #[test]
    fn root_domain_deep_subdomain() {
        let url = Url::parse("https://a.b.c.example.com/").unwrap();
        assert_eq!(root_domain(&url), "example.com");
    }

    #[test]
    fn root_domain_country_tld() {
        let url = Url::parse("https://blog.example.co.uk/").unwrap();
        assert_eq!(root_domain(&url), "example.co.uk");
    }

    #[test]
    fn root_domain_strips_www() {
        let url = Url::parse("https://www.example.com/").unwrap();
        assert_eq!(root_domain(&url), "example.com");
    }

    // -- validate_glob tests --

    #[test]
    fn validate_glob_accepts_reasonable_patterns() {
        assert!(validate_glob("/api/*").is_ok());
        assert!(validate_glob("/api/**").is_ok());
        assert!(validate_glob("/docs/**/page-*.html").is_ok());
        // Exactly MAX_GLOB_DOUBLESTAR `**` wildcards is still allowed.
        assert!(validate_glob("/a/**/b/**/c/**/d/**").is_ok());
    }

    #[test]
    fn validate_glob_rejects_too_many_doublestars() {
        // 5 `**` exceeds MAX_GLOB_DOUBLESTAR = 4.
        let pat = "/a/**/b/**/c/**/d/**/e/**";
        let err = validate_glob(pat).unwrap_err();
        assert!(matches!(err, FetchError::Build(ref m) if m.contains("`**` wildcards")));
    }

    #[test]
    fn validate_glob_treats_triple_star_as_one() {
        // `***` is still one run, should not count as 2.
        assert!(validate_glob("/a/***/b/***/c/***/d/***").is_ok());
    }

    #[test]
    fn validate_glob_rejects_oversized_pattern() {
        // 2048 bytes is above MAX_GLOB_LEN = 1024.
        let giant = "x".repeat(2048);
        let err = validate_glob(&giant).unwrap_err();
        assert!(matches!(err, FetchError::Build(ref m) if m.contains("exceeds")));
    }

    // -- glob_match tests --

    #[test]
    fn glob_star_matches_single_segment() {
        assert!(glob_match("/api/*", "/api/users"));
        assert!(glob_match("/api/*", "/api/products"));
        assert!(!glob_match("/api/*", "/api/users/123"));
    }

    #[test]
    fn glob_doublestar_matches_multiple_segments() {
        assert!(glob_match("/api/**", "/api/users"));
        assert!(glob_match("/api/**", "/api/users/123"));
        assert!(glob_match("/api/**", "/api/a/b/c/d"));
        assert!(!glob_match("/api/**", "/docs/intro"));
    }

    #[test]
    fn glob_exact_match() {
        assert!(glob_match("/about", "/about"));
        assert!(!glob_match("/about", "/about/team"));
    }

    #[test]
    fn glob_middle_wildcard() {
        assert!(glob_match("/docs/*/intro", "/docs/v2/intro"));
        assert!(!glob_match("/docs/*/intro", "/docs/v2/v3/intro"));
    }

    #[test]
    fn glob_no_pattern_matches_nothing() {
        // Empty pattern only matches empty string
        assert!(glob_match("", ""));
        assert!(!glob_match("", "/foo"));
    }

    #[test]
    fn glob_trailing_star() {
        assert!(glob_match("/blog*", "/blog"));
        assert!(glob_match("/blog*", "/blog-post"));
        assert!(!glob_match("/blog*", "/blog/post")); // * doesn't cross /
    }
}
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								/// Recursive web crawler built on top of [`FetchClient`].
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								///
 								/// Starts from a seed URL, extracts content, discovers links, and follows
 								/// them breadth-first up to a configurable depth/page limit. Uses a semaphore
 								/// for bounded concurrency and per-request delays for politeness.
 								///
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								/// Scope control: by default only same-origin links are followed. Enable
 								/// `allow_subdomains` to include sibling/child subdomains of the seed host,
 								/// or `allow_external_links` to follow links to any domain.
 								///
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								/// When `use_sitemap` is enabled, the crawler first discovers URLs from the
 								/// site's sitemaps and seeds the BFS frontier before crawling.
 								use std::collections::HashSet;
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								use std::path::Path;
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								use std::sync::Arc;
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								use std::sync::atomic::{AtomicBool, Ordering};
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								use std::time::{Duration, Instant};
 								use serde::{Deserialize, Serialize};
 								use tokio::sync::Semaphore;
 								use tracing::{debug, info, warn};
 								use url::Url;
 								use crate::client::{FetchClient, FetchConfig};
 								use crate::error::FetchError;
 								use crate::sitemap;
 								/// Controls crawl scope, depth, concurrency, and politeness.
 								#[derive(Debug, Clone)]
 								pub struct CrawlConfig {
 								    /// Fetch configuration (browser profile, proxy, timeout, etc.)
 								    pub fetch: FetchConfig,
 								    /// How deep to follow links. 1 = only immediate links from seed page.
 								    pub max_depth: usize,
 								    /// Hard cap on total pages fetched (including the seed).
 								    pub max_pages: usize,
 								    /// Max concurrent in-flight requests.
 								    pub concurrency: usize,
 								    /// Minimum delay before each request (politeness).
 								    pub delay: Duration,
 								    /// Only follow URLs whose path starts with this prefix (e.g. "/docs/").
 								    pub path_prefix: Option<String>,
 								    /// Seed BFS frontier from sitemap discovery before crawling.
 								    pub use_sitemap: bool,
 								    /// Glob patterns for paths to include. If non-empty, only matching URLs are crawled.
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								    /// E.g. `["/api/*", "/guides/*"]` -- matched against the URL path.
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								    pub include_patterns: Vec<String>,
 								    /// Glob patterns for paths to exclude. Checked after include_patterns.
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								    /// E.g. `["/changelog/*", "/blog/*"]` -- matching URLs are skipped.
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								    pub exclude_patterns: Vec<String>,
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								    /// Follow links on subdomains of the seed domain (e.g. blog.example.com
 								    /// when crawling example.com). Default: false (same-origin only).
 								    pub allow_subdomains: bool,
 								    /// Follow links to entirely different domains. Default: false.
 								    /// When true, the crawler becomes cross-origin. Use with caution.
 								    pub allow_external_links: bool,
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								    /// Optional channel sender for streaming per-page results as they complete.
 								    /// When set, each `PageResult` is sent on this channel immediately after extraction.
 								    pub progress_tx: Option<tokio::sync::broadcast::Sender<PageResult>>,
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								    /// When set to `true`, the crawler breaks out of the main loop early.
 								    /// Callers (e.g. a Ctrl+C handler) can flip this to request graceful cancellation.
 								    pub cancel_flag: Option<Arc<AtomicBool>>,
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								}
 								impl Default for CrawlConfig {
 								    fn default() -> Self {
 								        Self {
 								            fetch: FetchConfig::default(),
 								            max_depth: 1,
 								            max_pages: 50,
 								            concurrency: 5,
 								            delay: Duration::from_millis(100),
 								            path_prefix: None,
 								            use_sitemap: false,
 								            include_patterns: Vec::new(),
 								            exclude_patterns: Vec::new(),
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								            allow_subdomains: false,
 								            allow_external_links: false,
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								            progress_tx: None,
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								            cancel_flag: None,
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								        }
 								    }
 								}
 								/// Aggregated results from a crawl run.
 								#[derive(Debug, Serialize, Deserialize)]
 								pub struct CrawlResult {
 								    pub pages: Vec<PageResult>,
 								    pub total: usize,
 								    pub ok: usize,
 								    pub errors: usize,
 								    pub elapsed_secs: f64,
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								    /// URLs visited during this crawl (for resume state).
 								    #[serde(skip)]
 								    pub visited: HashSet<String>,
 								    /// Remaining frontier when crawl was cancelled (for resume state).
 								    #[serde(skip)]
 								    pub remaining_frontier: Vec<(String, usize)>,
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								}
 								/// Outcome of extracting a single page during the crawl.
 								///
 								/// One value is produced per page reported by the crawler, whether the
 								/// page succeeded or failed. The type is `Clone` so the same value can be
 								/// handed to more than one consumer.
 								#[derive(Debug, Clone, Serialize, Deserialize)]
 								pub struct PageResult {
 								    /// URL this result is reported for.
 								    pub url: String,
 								    /// Crawl depth recorded for the page; the start URL is reported at depth 0.
 								    pub depth: usize,
 								    /// Extraction output for the page. `None` when the page is reported as a
 								    /// failure (e.g. the invalid-URL result built by `crawl`).
 								    // NOTE(review): presumably `Some` whenever `error` is `None` — confirm against crawl().
 								    pub extraction: Option<webclaw_core::ExtractionResult>,
 								    /// Human-readable failure description (e.g. `invalid URL: ...`).
 								    pub error: Option<String>,
 								    /// Time attributed to this page; `Duration::ZERO` for the invalid-URL result.
 								    /// Skipped by serde: it is omitted when serializing and filled with the
 								    /// default value when deserializing.
 								    #[serde(skip)]
 								    pub elapsed: Duration,
 								}
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								/// Serializable crawl state for resume after Ctrl+C cancellation.
 								#[derive(Debug, Clone, Serialize, Deserialize)]
 								pub struct CrawlState {
 								    /// Seed URL the saved crawl was started from.
 								    pub seed_url: String,
 								    /// URLs recorded as visited. `save_state` fills this from a `HashSet`, so
 								    /// the order of entries carries no meaning.
 								    pub visited: Vec<String>,
 								    /// Entries still waiting to be crawled when the state was saved.
 								    // NOTE(review): presumably `(url, depth)` pairs — confirm against crawl().
 								    pub frontier: Vec<(String, usize)>,
 								    /// Number of pages completed at the time the state was saved.
 								    pub completed_pages: usize,
 								    /// `max_pages` value supplied by the caller when the state was saved.
 								    pub max_pages: usize,
 								    /// `max_depth` value supplied by the caller when the state was saved.
 								    pub max_depth: usize,
 								}
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								/// Recursive crawler that wraps a shared [`FetchClient`].
 								pub struct Crawler {
 								    client: Arc<FetchClient>,
 								    config: CrawlConfig,
 								    seed_origin: String,
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								    /// Root domain of the seed URL for subdomain matching (e.g. "example.com").
 								    seed_root_domain: String,
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								}
 								impl Crawler {
 								    /// Build a new crawler from a seed URL and config.
 								    /// Constructs the underlying `FetchClient` from `config.fetch`.
 								    pub fn new(seed_url: &str, config: CrawlConfig) -> Result<Self, FetchError> {
 								        let seed = Url::parse(seed_url).map_err(|_| FetchError::InvalidUrl(seed_url.into()))?;
 								        let seed_origin = origin_key(&seed);
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								        let seed_root_domain = root_domain(&seed);
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
-												feat(fetch,llm): DoS hardening + glob validation + cleanup (P2) (#22)

* feat(fetch,llm): DoS hardening via response caps + glob validation (P2)

Response body caps:
- webclaw-fetch::Response::from_wreq now rejects bodies over 50 MB. Checks
  Content-Length up front (before the allocation) and the actual
  .bytes() length after (belt-and-braces against lying upstreams).
  Previously the HTML -> markdown conversion downstream could allocate
  multiple String copies per page; a 100 MB page would OOM the process.
- webclaw-llm providers (anthropic/openai/ollama) share a new
  response_json_capped helper with a 5 MB cap. Protects against a
  malicious or runaway provider response exhausting memory.

Crawler frontier cap: after each BFS depth level the frontier is
truncated to max(max_pages * 10, 100) entries, keeping the most
recently discovered links. Dense pages (tag clouds, search results)
used to push the frontier into the tens of thousands even after
max_pages halted new fetches.

Glob pattern validation: user-supplied include_patterns /
exclude_patterns are rejected at Crawler::new if they contain more
than 4 `**` wildcards or exceed 1024 chars. The backtracking matcher
degrades exponentially on deeply-nested `**` against long paths.

Cleanup:
- Removed blanket #![allow(dead_code)] from webclaw-cli/src/main.rs;
  no warnings surfaced, the suppression was obsolete.
- core/.gitignore: replaced overbroad *.json with specific local-
  artifact patterns (previous rule would have swallowed package.json,
  components.json, .smithery/*.json).

Tests: +4 validate_glob tests. Full workspace test: 283 passed
(webclaw-core + webclaw-fetch + webclaw-llm).

Version: 0.3.15 -> 0.3.16
CHANGELOG updated.

Refs: docs/AUDIT-2026-04-16.md (P2 section)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* chore: gitignore CLI research dumps, drop accidentally-tracked file

research-*.json output from `webclaw ... --research ...` got silently
swept into git by the relaxed *.json gitignore in the preceding commit.
The old blanket *.json rule was hiding both this legitimate scratch
file AND packages/create-webclaw/server.json (MCP registry config that
we DO want tracked).

Removes the research dump from git and adds a narrower research-*.json
ignore pattern so future CLI output doesn't get re-tracked by accident.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
											
										
										
											2026-04-16 19:44:08 +02:00
+								        // Reject pathological user-supplied glob patterns before they can
 								        // exercise the recursive `**` handler in glob_match_inner. The
 								        // matcher is a straight backtracking implementation; a deeply
 								        // nested `**/**/**/...` pattern against a long path can degrade
 								        // to exponential time per link checked, per page crawled.
 								        for pat in config
 								            .include_patterns
 								            .iter()
 								            .chain(config.exclude_patterns.iter())
 								        {
 								            validate_glob(pat)?;
 								        }
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								        let client = FetchClient::new(config.fetch.clone())?;
 								        Ok(Self {
 								            client: Arc::new(client),
 								            config,
 								            seed_origin,
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								            seed_root_domain,
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								        })
 								    }
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								    /// Save current crawl state to a JSON file for later resume.
 								    ///
 								    /// Copies `visited` and `frontier` into a [`CrawlState`], serializes it as
 								    /// pretty-printed JSON and writes the result to `path` in a single
 								    /// `std::fs::write` call. `visited` comes from a `HashSet`, so the order of
 								    /// the URLs in the written file is unspecified.
 								    ///
 								    /// # Errors
 								    ///
 								    /// Returns a message prefixed with `serialize state:` when JSON encoding
 								    /// fails, or `write state to <path>:` when the file cannot be written.
 								    pub fn save_state(
 								        path: &Path,
 								        seed_url: &str,
 								        visited: &HashSet<String>,
 								        frontier: &[(String, usize)],
 								        completed_pages: usize,
 								        max_pages: usize,
 								        max_depth: usize,
 								    ) -> Result<(), String> {
 								        let visited: Vec<String> = visited.iter().cloned().collect();
 								        let snapshot = CrawlState {
 								            seed_url: seed_url.to_string(),
 								            visited,
 								            frontier: frontier.to_vec(),
 								            completed_pages,
 								            max_pages,
 								            max_depth,
 								        };

 								        // Encode first so a serialization failure never touches the file.
 								        match serde_json::to_string_pretty(&snapshot) {
 								            Ok(body) => std::fs::write(path, body)
 								                .map_err(|e| format!("write state to {}: {e}", path.display())),
 								            Err(e) => Err(format!("serialize state: {e}")),
 								        }
 								    }
 								    /// Load a previously saved [`CrawlState`] from the JSON file at `path`.
 								    ///
 								    /// Returns `None` when the file cannot be read (for example because it
 								    /// does not exist) and also when its contents do not deserialize into a
 								    /// `CrawlState`; the two cases are not distinguished.
 								    pub fn load_state(path: &Path) -> Option<CrawlState> {
 								        std::fs::read_to_string(path)
 								            .ok()
 								            .and_then(|raw| serde_json::from_str(&raw).ok())
 								    }
 								    /// Returns true if the cancel flag has been set.
-												polish(fetch,mcp): robots parser + firefox client cache + Acquire ordering (P3) (#23)

Three P3 items from the 2026-04-16 audit. Bump to 0.3.17.

webclaw-fetch/sitemap.rs: parse_robots_txt used trimmed[..8] slice
plus eq_ignore_ascii_case for the directive test. That was fragile:
"Sitemap :" (space before colon) fell through silently, inline
"# ..." comments leaked into the URL, and a line with no URL at all
returned an empty string. Rewritten to split on the first colon,
match any-case "sitemap" as the directive name, strip comments, and
require `://` in the value. +7 unit tests cover case variants,
space-before-colon, comments, empty values, non-URL values, and
non-sitemap directives.

webclaw-fetch/crawler.rs: is_cancelled uses Ordering::Acquire
instead of Relaxed. Behaviourally equivalent on current hardware for
single-word atomic loads, but the explicit ordering documents intent
for readers + compilers.

webclaw-mcp/server.rs: add lazy OnceLock cache for the Firefox
FetchClient. Tool calls that repeatedly request the firefox profile
without cookies used to build a fresh reqwest pool + TLS stack per
call. Chrome (default) already used the long-lived field; Random is
per-call by design; cookie-bearing requests still build ad-hoc since
the cookie header is part of the client shape.

Tests: 85 webclaw-fetch (was 78, +7 new sitemap), 272 webclaw-core,
43 webclaw-llm, 11 CLI — all green. Clippy clean across workspace.

Refs: docs/AUDIT-2026-04-16.md P3 section

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
											
										
										
											2026-04-16 20:21:32 +02:00
+								    ///
 								    /// Uses an `Acquire` load so that it pairs with a `Release` (or stronger)
 								    /// store by whichever caller sets the flag. For a flag that publishes no
 								    /// other data, a `Relaxed` load would also be sufficient on any platform;
 								    /// the stronger ordering is chosen to make the intent explicit, so the
 								    /// compiler and future readers don't need to reason about the memory model.
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								    fn is_cancelled(&self) -> bool {
 								        self.config
 								            .cancel_flag
 								            .as_ref()
-												polish(fetch,mcp): robots parser + firefox client cache + Acquire ordering (P3) (#23)

Three P3 items from the 2026-04-16 audit. Bump to 0.3.17.

webclaw-fetch/sitemap.rs: parse_robots_txt used trimmed[..8] slice
plus eq_ignore_ascii_case for the directive test. That was fragile:
"Sitemap :" (space before colon) fell through silently, inline
"# ..." comments leaked into the URL, and a line with no URL at all
returned an empty string. Rewritten to split on the first colon,
match any-case "sitemap" as the directive name, strip comments, and
require `://` in the value. +7 unit tests cover case variants,
space-before-colon, comments, empty values, non-URL values, and
non-sitemap directives.

webclaw-fetch/crawler.rs: is_cancelled uses Ordering::Acquire
instead of Relaxed. Behaviourally equivalent on current hardware for
single-word atomic loads, but the explicit ordering documents intent
for readers + compilers.

webclaw-mcp/server.rs: add lazy OnceLock cache for the Firefox
FetchClient. Tool calls that repeatedly request the firefox profile
without cookies used to build a fresh reqwest pool + TLS stack per
call. Chrome (default) already used the long-lived field; Random is
per-call by design; cookie-bearing requests still build ad-hoc since
the cookie header is part of the client shape.

Tests: 85 webclaw-fetch (was 78, +7 new sitemap), 272 webclaw-core,
43 webclaw-llm, 11 CLI — all green. Clippy clean across workspace.

Refs: docs/AUDIT-2026-04-16.md P3 section

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
											
										
										
											2026-04-16 20:21:32 +02:00
+								            .is_some_and(|f| f.load(Ordering::Acquire))
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								    }
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								    /// Crawl starting from `start_url`, returning results for every page visited.
 								    ///
 								    /// Uses breadth-first traversal: all pages at depth N are fetched (concurrently,
 								    /// bounded by `config.concurrency`) before moving to depth N+1.
 								    ///
 								    /// When `config.use_sitemap` is true, sitemap URLs are discovered first and
 								    /// added to the initial frontier at depth 0 alongside the seed URL.
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								    ///
 								    /// If `resume_state` is provided, the crawl resumes from the saved state
 								    /// (pre-populated visited set and frontier) instead of starting fresh.
 								    pub async fn crawl(&self, start_url: &str, resume_state: Option<CrawlState>) -> CrawlResult {
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								        let start = Instant::now();
 								        let seed = match Url::parse(start_url) {
 								            Ok(u) => u,
 								            Err(_) => {
 								                return CrawlResult {
 								                    pages: vec![PageResult {
 								                        url: start_url.to_string(),
 								                        depth: 0,
 								                        extraction: None,
 								                        error: Some(format!("invalid URL: {start_url}")),
 								                        elapsed: Duration::ZERO,
 								                    }],
 								                    total: 1,
 								                    ok: 0,
 								                    errors: 1,
 								                    elapsed_secs: 0.0,
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								                    visited: HashSet::new(),
 								                    remaining_frontier: Vec::new(),
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								                };
 								            }
 								        };
 								        let semaphore = Arc::new(Semaphore::new(self.config.concurrency));
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								        let mut visited: HashSet<String>;
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								        let mut pages: Vec<PageResult> = Vec::new();
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								        let mut frontier: Vec<(String, usize)>;
 								        // Resume from saved state or start fresh
 								        if let Some(state) = resume_state {
 								            visited = state.visited.into_iter().collect();
 								            frontier = state.frontier;
 								            info!(
 								                visited = visited.len(),
 								                frontier = frontier.len(),
 								                "resuming crawl from saved state"
 								            );
 								        } else {
 								            visited = HashSet::new();
 								            frontier = vec![(normalize(&seed), 0)];
 								            // Seed frontier from sitemap if enabled
 								            if self.config.use_sitemap {
 								                let base_url = format!("{}://{}", seed.scheme(), seed.host_str().unwrap_or(""));
 								                match sitemap::discover(&self.client, &base_url).await {
 								                    Ok(entries) => {
 								                        let before = frontier.len();
 								                        for entry in entries {
 								                            if self.qualify_link(&entry.url, &visited).is_some() {
 								                                let parsed = match Url::parse(&entry.url) {
 								                                    Ok(u) => u,
 								                                    Err(_) => continue,
 								                                };
 								                                let norm = normalize(&parsed);
 								                                frontier.push((norm, 0));
 								                            }
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								                        }
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								                        let added = frontier.len() - before;
 								                        info!(
 								                            sitemap_urls = added,
 								                            "seeded frontier from sitemap discovery"
 								                        );
 								                    }
 								                    Err(e) => {
 								                        warn!(error = %e, "sitemap discovery failed, continuing with seed URL only");
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								                    }
 								                }
 								            }
 								        }
 								        while !frontier.is_empty() && pages.len() < self.config.max_pages {
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								            // Check cancel flag before processing each batch
 								            if self.is_cancelled() {
 								                info!("crawl cancelled by user");
 								                break;
 								            }
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								            // Dedup this level's frontier against the visited set and page cap
 								            let batch: Vec<(String, usize)> = frontier
 								                .drain(..)
 								                .filter(|(url, _)| visited.insert(url.clone()))
 								                .take(self.config.max_pages.saturating_sub(pages.len()))
 								                .collect();
 								            if batch.is_empty() {
 								                break;
 								            }
 								            // Spawn one task per URL, bounded by semaphore
 								            let mut handles = Vec::with_capacity(batch.len());
 								            for (url, depth) in &batch {
 								                let permit = Arc::clone(&semaphore);
 								                let client = Arc::clone(&self.client);
 								                let url = url.clone();
 								                let depth = *depth;
 								                let delay = self.config.delay;
 								                handles.push(tokio::spawn(async move {
-												fix(fetch): surface semaphore-closed as typed error instead of panic (P1) (#21)

Three call sites in webclaw-fetch used .expect("semaphore closed") on
`Semaphore::acquire()`. Under normal operation they never fire, but
under a shutdown race or adversarial runtime state the spawned task
would panic and be silently dropped from the batch / crawl run — the
caller would see fewer results than URLs with no indication why.

Rewritten to match on the acquire result:
- client::fetch_batch and client::fetch_and_extract_batch_with_options
  now emit BatchResult/BatchExtractResult carrying
  FetchError::Build("semaphore closed before acquire").
- crawler's inner loop emits a failed PageResult with the same error
  string instead of panicking.

Behaviorally a no-op for the happy path. Fixes the silent-dropped-task
class of bug noted in the 2026-04-16 audit.

Version: 0.3.14 -> 0.3.15
CHANGELOG updated.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
											
										
										
											2026-04-16 19:20:26 +02:00
+								                    // Acquire permit -- blocks if concurrency limit reached.
 								                    // Surface semaphore-closed as a failed PageResult rather
 								                    // than panicking the spawned task and silently dropping
 								                    // it from the batch.
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								                    let page_start = Instant::now();
-												fix(fetch): surface semaphore-closed as typed error instead of panic (P1) (#21)

Three call sites in webclaw-fetch used .expect("semaphore closed") on
`Semaphore::acquire()`. Under normal operation they never fire, but
under a shutdown race or adversarial runtime state the spawned task
would panic and be silently dropped from the batch / crawl run — the
caller would see fewer results than URLs with no indication why.

Rewritten to match on the acquire result:
- client::fetch_batch and client::fetch_and_extract_batch_with_options
  now emit BatchResult/BatchExtractResult carrying
  FetchError::Build("semaphore closed before acquire").
- crawler's inner loop emits a failed PageResult with the same error
  string instead of panicking.

Behaviorally a no-op for the happy path. Fixes the silent-dropped-task
class of bug noted in the 2026-04-16 audit.

Version: 0.3.14 -> 0.3.15
CHANGELOG updated.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
											
										
										
											2026-04-16 19:20:26 +02:00
+								                    let result = match permit.acquire().await {
 								                        Ok(_permit) => {
 								                            tokio::time::sleep(delay).await;
 								                            client.fetch_and_extract(&url).await
 								                        }
 								                        Err(_) => {
 								                            warn!(url = %url, depth, "semaphore closed before acquire");
 								                            return PageResult {
 								                                url,
 								                                depth,
 								                                extraction: None,
 								                                error: Some("semaphore closed before acquire".into()),
 								                                elapsed: page_start.elapsed(),
 								                            };
 								                        }
 								                    };
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								                    let elapsed = page_start.elapsed();
 								                    match result {
 								                        Ok(extraction) => {
 								                            debug!(
 								                                url = %url, depth,
 								                                elapsed_ms = %elapsed.as_millis(),
 								                                "page extracted"
 								                            );
 								                            PageResult {
 								                                url,
 								                                depth,
 								                                extraction: Some(extraction),
 								                                error: None,
 								                                elapsed,
 								                            }
 								                        }
 								                        Err(e) => {
 								                            warn!(url = %url, depth, error = %e, "page failed");
 								                            PageResult {
 								                                url,
 								                                depth,
 								                                extraction: None,
 								                                error: Some(e.to_string()),
 								                                elapsed,
 								                            }
 								                        }
 								                    }
 								                }));
 								            }
 								            // Collect results and harvest links for the next depth level
 								            let mut next_frontier: Vec<(String, usize)> = Vec::new();
 								            for handle in handles {
 								                let page = match handle.await {
 								                    Ok(page) => page,
 								                    Err(e) => {
 								                        warn!(error = %e, "crawl task panicked");
 								                        continue;
 								                    }
 								                };
 								                let depth = page.depth;
 								                if depth < self.config.max_depth
 								                    && let Some(ref extraction) = page.extraction
 								                {
 								                    for link in &extraction.content.links {
 								                        if let Some(candidate) = self.qualify_link(&link.href, &visited) {
 								                            next_frontier.push((candidate, depth + 1));
 								                        }
 								                    }
 								                }
 								                // Stream progress if a channel is configured
 								                if let Some(tx) = &self.config.progress_tx {
 								                    let _ = tx.send(page.clone());
 								                }
 								                pages.push(page);
 								                if pages.len() >= self.config.max_pages {
 								                    break;
 								                }
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
 								                // Check cancel flag between page results
 								                if self.is_cancelled() {
 								                    info!("crawl cancelled by user (mid-batch)");
 								                    break;
 								                }
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								            }
-												feat(fetch,llm): DoS hardening + glob validation + cleanup (P2) (#22)

* feat(fetch,llm): DoS hardening via response caps + glob validation (P2)

Response body caps:
- webclaw-fetch::Response::from_wreq now rejects bodies over 50 MB. Checks
  Content-Length up front (before the allocation) and the actual
  .bytes() length after (belt-and-braces against lying upstreams).
  Previously the HTML -> markdown conversion downstream could allocate
  multiple String copies per page; a 100 MB page would OOM the process.
- webclaw-llm providers (anthropic/openai/ollama) share a new
  response_json_capped helper with a 5 MB cap. Protects against a
  malicious or runaway provider response exhausting memory.

Crawler frontier cap: after each BFS depth level the frontier is
truncated to max(max_pages * 10, 100) entries, keeping the most
recently discovered links. Dense pages (tag clouds, search results)
used to push the frontier into the tens of thousands even after
max_pages halted new fetches.

Glob pattern validation: user-supplied include_patterns /
exclude_patterns are rejected at Crawler::new if they contain more
than 4 `**` wildcards or exceed 1024 chars. The backtracking matcher
degrades exponentially on deeply-nested `**` against long paths.

Cleanup:
- Removed blanket #![allow(dead_code)] from webclaw-cli/src/main.rs;
  no warnings surfaced, the suppression was obsolete.
- core/.gitignore: replaced overbroad *.json with specific local-
  artifact patterns (previous rule would have swallowed package.json,
  components.json, .smithery/*.json).

Tests: +4 validate_glob tests. Full workspace test: 283 passed
(webclaw-core + webclaw-fetch + webclaw-llm).

Version: 0.3.15 -> 0.3.16
CHANGELOG updated.

Refs: docs/AUDIT-2026-04-16.md (P2 section)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* chore: gitignore CLI research dumps, drop accidentally-tracked file

research-*.json output from `webclaw ... --research ...` got silently
swept into git by the relaxed *.json gitignore in the preceding commit.
The old blanket *.json rule was hiding both this legitimate scratch
file AND packages/create-webclaw/server.json (MCP registry config that
we DO want tracked).

Removes the research dump from git and adds a narrower research-*.json
ignore pattern so future CLI output doesn't get re-tracked by accident.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
											
										
										
											2026-04-16 19:44:08 +02:00
+								            // Cap frontier size independently of max_pages. Pages like
 								            // search-result listings or tag clouds can emit thousands of
 								            // links per page; without this a single dense page could push
 								            // the frontier into the tens of thousands of entries and keep
 								            // String allocations alive even after max_pages halts crawling.
 								            // Trim aggressively once we exceed 10× max_pages, keeping the
 								            // most recently discovered entries which are still on-topic
 								            // (breadth-first = siblings of the last page we saw).
 								            let frontier_cap = self.config.max_pages.saturating_mul(10).max(100);
 								            if next_frontier.len() > frontier_cap {
 								                let keep = self.config.max_pages.saturating_mul(5).max(50);
 								                warn!(
 								                    frontier = next_frontier.len(),
 								                    cap = frontier_cap,
 								                    trimmed_to = keep,
 								                    "frontier exceeded cap, truncating"
 								                );
 								                next_frontier.truncate(keep);
 								            }
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								            frontier = next_frontier;
 								        }
 								        let total_elapsed = start.elapsed();
 								        let ok_count = pages.iter().filter(|p| p.extraction.is_some()).count();
 								        let err_count = pages.len() - ok_count;
 								        info!(
 								            total = pages.len(),
 								            ok = ok_count,
 								            errors = err_count,
 								            elapsed_ms = %total_elapsed.as_millis(),
 								            "crawl complete"
 								        );
 								        CrawlResult {
 								            total: pages.len(),
 								            ok: ok_count,
 								            errors: err_count,
 								            elapsed_secs: total_elapsed.as_secs_f64(),
-												feat: v0.1.3 — crawl streaming, resume/cancel, MCP proxy support

Crawl:
- Real-time progress on stderr as pages complete
- --crawl-state saves progress on Ctrl+C, resumes from saved state
- Visited set + remaining frontier persisted for accurate resume

MCP server:
- Reads WEBCLAW_PROXY and WEBCLAW_PROXY_FILE env vars
- Falls back to proxies.txt in CWD (existing behavior)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-03-25 21:38:28 +01:00
+								            remaining_frontier: frontier,
 								            visited,
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								            pages,
 								        }
 								    }
 								    /// Check if a discovered link should be added to the frontier.
 								    /// Returns `Some(normalized_url)` if it passes all filters, `None` otherwise.
 								    fn qualify_link(&self, href: &str, visited: &HashSet<String>) -> Option<String> {
 								        let parsed = Url::parse(href).ok()?;
 								        // Only http(s) schemes
 								        match parsed.scheme() {
 								            "http" | "https" => {}
 								            _ => return None,
 								        }
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								        // Scope check: same-origin, subdomain, or external
 								        if !self.config.allow_external_links {
 								            let link_origin = origin_key(&parsed);
 								            if link_origin != self.seed_origin {
 								                // Not same-origin. Check if subdomain crawling is allowed.
 								                if self.config.allow_subdomains {
 								                    let link_root = root_domain(&parsed);
 								                    if link_root != self.seed_root_domain {
 								                        return None;
 								                    }
 								                } else {
 								                    return None;
 								                }
 								            }
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								        }
 								        // Path prefix filter
 								        if let Some(ref prefix) = self.config.path_prefix
 								            && !parsed.path().starts_with(prefix.as_str())
 								        {
 								            return None;
 								        }
 								        // Include patterns: if any are set, path must match at least one
 								        let path = parsed.path();
 								        if !self.config.include_patterns.is_empty()
 								            && !self
 								                .config
 								                .include_patterns
 								                .iter()
 								                .any(|pat| glob_match(pat, path))
 								        {
 								            return None;
 								        }
 								        // Exclude patterns: if path matches any, skip
 								        if self
 								            .config
 								            .exclude_patterns
 								            .iter()
 								            .any(|pat| glob_match(pat, path))
 								        {
 								            return None;
 								        }
 								        // Skip common non-page file extensions
 								        const SKIP_EXTENSIONS: &[&str] = &[
 								            ".pdf", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico", ".css", ".js",
 								            ".zip", ".tar", ".gz", ".xml", ".rss", ".mp3", ".mp4", ".avi", ".mov", ".woff",
 								            ".woff2", ".ttf", ".eot",
 								        ];
 								        if SKIP_EXTENSIONS.iter().any(|ext| path.ends_with(ext)) {
 								            return None;
 								        }
 								        let normalized = normalize(&parsed);
 								        if visited.contains(&normalized) {
 								            return None;
 								        }
 								        Some(normalized)
 								    }
 								}
 								/// Build the key used for same-origin comparison: "scheme://host[:port]".
 								///
 								/// A leading "www." on the host is dropped, and the port is appended only
 								/// when `Url::port` reports one.
 								fn origin_key(url: &Url) -> String {
 								    let raw_host = url.host_str().unwrap_or("");
 								    let bare_host = raw_host.strip_prefix("www.").unwrap_or(raw_host);
 								    let mut key = format!("{}://{}", url.scheme(), bare_host);
 								    if let Some(port) = url.port() {
 								        key.push(':');
 								        key.push_str(&port.to_string());
 								    }
 								    key
 								}
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								/// Extract the root domain from a URL for subdomain comparison.
 								/// "blog.docs.example.com" -> "example.com", "example.co.uk" -> "example.co.uk" (best-effort).
 								///
 								/// Uses a simple heuristic: take the last two labels, or three if the second-to-last
 								/// is short (<=3 chars, likely a country SLD like "co.uk", "com.au").
 								fn root_domain(url: &Url) -> String {
 								    let host = url.host_str().unwrap_or("");
 								    let host = host.strip_prefix("www.").unwrap_or(host);
 								    let labels: Vec<&str> = host.split('.').collect();
 								    if labels.len() <= 2 {
 								        return host.to_ascii_lowercase();
 								    }
 								    // Heuristic for two-part TLDs (co.uk, com.au, org.br, etc.)
 								    let sld = labels[labels.len() - 2];
 								    if labels.len() >= 3 && sld.len() <= 3 {
 								        labels[labels.len() - 3..].join(".").to_ascii_lowercase()
 								    } else {
 								        labels[labels.len() - 2..].join(".").to_ascii_lowercase()
 								    }
 								}
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								/// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"),
 								/// lowercase scheme + host. Preserves query params and path case.
 								fn normalize(url: &Url) -> String {
 								    let scheme = url.scheme();
 								    let host = url.host_str().unwrap_or("").to_ascii_lowercase();
 								    let port_suffix = match url.port() {
 								        Some(p) => format!(":{p}"),
 								        None => String::new(),
 								    };
 								    let mut path = url.path().to_string();
 								    if path.len() > 1 && path.ends_with('/') {
 								        path.pop();
 								    }
 								    let query = match url.query() {
 								        Some(q) => format!("?{q}"),
 								        None => String::new(),
 								    };
 								    // Fragment intentionally omitted
 								    format!("{scheme}://{host}{port_suffix}{path}{query}")
 								}
-												feat(fetch,llm): DoS hardening + glob validation + cleanup (P2) (#22)

* feat(fetch,llm): DoS hardening via response caps + glob validation (P2)

Response body caps:
- webclaw-fetch::Response::from_wreq now rejects bodies over 50 MB. Checks
  Content-Length up front (before the allocation) and the actual
  .bytes() length after (belt-and-braces against lying upstreams).
  Previously the HTML -> markdown conversion downstream could allocate
  multiple String copies per page; a 100 MB page would OOM the process.
- webclaw-llm providers (anthropic/openai/ollama) share a new
  response_json_capped helper with a 5 MB cap. Protects against a
  malicious or runaway provider response exhausting memory.

Crawler frontier cap: after each BFS depth level the frontier is
truncated to max(max_pages * 10, 100) entries, keeping the most
recently discovered links. Dense pages (tag clouds, search results)
used to push the frontier into the tens of thousands even after
max_pages halted new fetches.

Glob pattern validation: user-supplied include_patterns /
exclude_patterns are rejected at Crawler::new if they contain more
than 4 `**` wildcards or exceed 1024 chars. The backtracking matcher
degrades exponentially on deeply-nested `**` against long paths.

Cleanup:
- Removed blanket #![allow(dead_code)] from webclaw-cli/src/main.rs;
  no warnings surfaced, the suppression was obsolete.
- core/.gitignore: replaced overbroad *.json with specific local-
  artifact patterns (previous rule would have swallowed package.json,
  components.json, .smithery/*.json).

Tests: +4 validate_glob tests. Full workspace test: 283 passed
(webclaw-core + webclaw-fetch + webclaw-llm).

Version: 0.3.15 -> 0.3.16
CHANGELOG updated.

Refs: docs/AUDIT-2026-04-16.md (P2 section)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* chore: gitignore CLI research dumps, drop accidentally-tracked file

research-*.json output from `webclaw ... --research ...` got silently
swept into git by the relaxed *.json gitignore in the preceding commit.
The old blanket *.json rule was hiding both this legitimate scratch
file AND packages/create-webclaw/server.json (MCP registry config that
we DO want tracked).

Removes the research dump from git and adds a narrower research-*.json
ignore pattern so future CLI output doesn't get re-tracked by accident.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
											
										
										
											2026-04-16 19:44:08 +02:00
+								/// Maximum number of `**` wildcards allowed in a single user glob. Each
 								/// additional `**` multiplies the backtracking fan-out of `glob_match_inner`
 								/// against adversarial paths; 4 is a practical ceiling for legitimate
 								/// nested include/exclude patterns and keeps the matcher's worst-case backtracking bounded.
 								const MAX_GLOB_DOUBLESTAR: usize = 4;
 								/// Maximum glob pattern length, in bytes (the limit is compared against
 								/// `str::len`). Keeps a single pattern from taking megabytes of RAM if
 								/// someone copy-pastes garbage into --include.
 								const MAX_GLOB_LEN: usize = 1024;
 								/// Vet a user-supplied glob pattern before it is handed to the matcher.
 								///
 								/// # Errors
 								/// Returns [`FetchError::Build`] when the pattern is longer than
 								/// `MAX_GLOB_LEN` bytes, or when it contains more than
 								/// `MAX_GLOB_DOUBLESTAR` runs of two or more consecutive `*`.
 								fn validate_glob(pat: &str) -> Result<(), FetchError> {
 								    let len = pat.len();
 								    if len > MAX_GLOB_LEN {
 								        return Err(FetchError::Build(format!(
 								            "glob pattern exceeds {MAX_GLOB_LEN} chars ({} given)",
 								            len
 								        )));
 								    }

 								    // Splitting on every non-`*` byte leaves exactly the maximal runs of `*`.
 								    // A run of length >= 2 is one `**` wildcard, so `***` still counts once.
 								    let count = pat
 								        .as_bytes()
 								        .split(|&byte| byte != b'*')
 								        .filter(|run| run.len() >= 2)
 								        .count();

 								    if count <= MAX_GLOB_DOUBLESTAR {
 								        return Ok(());
 								    }
 								    Err(FetchError::Build(format!(
 								        "glob pattern has {count} `**` wildcards (max {MAX_GLOB_DOUBLESTAR})"
 								    )))
 								}
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								/// Simple glob matching for URL paths. Supports:
 								/// - `*` matches any characters within a single path segment (no `/`)
 								/// - `**` matches any characters including `/` (any number of segments)
 								/// - `?` matches any single byte (including `/`)
 								/// - Literal characters match exactly
 								///
 								/// Examples:
 								/// - `/api/*` matches `/api/users` but not `/api/users/123`
 								/// - `/api/**` matches `/api/users`, `/api/users/123`, `/api/a/b/c`
 								/// - `/docs/*/intro` matches `/docs/v2/intro`
 								fn glob_match(pattern: &str, path: &str) -> bool {
 								    // Matching is done bytewise, so both sides are passed on as raw byte slices.
 								    let (pat_bytes, path_bytes) = (pattern.as_bytes(), path.as_bytes());
 								    glob_match_inner(pat_bytes, path_bytes)
 								}
 								/// Byte-level matcher behind `glob_match`.
 								///
 								/// Walks `pat` and `text` with two cursors:
 								/// - a run of two or more `*` (optionally followed by one `/`) matches any
 								///   bytes, implemented by recursing on every remaining suffix of `text`;
 								/// - a lone `*` matches zero or more bytes but never absorbs a `/`;
 								/// - `?` matches any single byte; every other byte must match literally.
 								///
 								/// Returns `true` only when all of `text` is consumed and whatever is left
 								/// of `pat` consists solely of `*`.
 								fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
 								    let mut pi = 0;
 								    let mut ti = 0;
 								    // `star_pi == usize::MAX` is the sentinel for "no lone `*` seen yet";
 								    // `star_ti` is the text position the most recent lone `*` started at.
 								    let mut star_pi = usize::MAX;
 								    let mut star_ti = 0;
 								    while ti < text.len() {
 								        if pi < pat.len() && pat[pi] == b'*' && pi + 1 < pat.len() && pat[pi + 1] == b'*' {
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								            // `**` -- match everything including slashes
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								            // Skip all consecutive `*`
 								            while pi < pat.len() && pat[pi] == b'*' {
 								                pi += 1;
 								            }
 								            // Skip trailing `/` after `**`
 								            if pi < pat.len() && pat[pi] == b'/' {
 								                pi += 1;
 								            }
 								            if pi >= pat.len() {
 								                return true; // `**` at end matches everything
 								            }
 								            // Try matching the rest of pattern against every suffix of text.
 								            // Each non-trailing `**` therefore adds one nested scan over `text`.
 								            for start in ti..=text.len() {
 								                if glob_match_inner(&pat[pi..], &text[start..]) {
 								                    return true;
 								                }
 								            }
 								            return false;
 								        } else if pi < pat.len() && pat[pi] == b'*' {
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								            // `*` -- match any chars except `/`
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								            star_pi = pi;
 								            star_ti = ti;
 								            pi += 1;
 								        } else if pi < pat.len() && (pat[pi] == text[ti] || pat[pi] == b'?') {
 								            // Literal byte match, or `?` standing in for any one byte.
 								            pi += 1;
 								            ti += 1;
 								        } else if star_pi != usize::MAX {
 								            // Backtrack: `*` absorbs one more char (but not `/`)
 								            if text[star_ti] == b'/' {
 								                return false;
 								            }
 								            star_ti += 1;
 								            ti = star_ti;
 								            pi = star_pi + 1;
 								        } else {
 								            // Mismatch with no earlier lone `*` to fall back on.
 								            return false;
 								        }
 								    }
 								    // Text is exhausted: consume trailing `*` or `**` in pattern; anything
 								    // else left over means the pattern still expected more input.
 								    while pi < pat.len() && pat[pi] == b'*' {
 								        pi += 1;
 								    }
 								    pi >= pat.len()
 								}
 								#[cfg(test)]
 								mod tests {
 								    use super::*;
 								    // The `#section` fragment is expected to be dropped.
 								    #[test]
 								    fn normalize_strips_fragment() {
 								        let with_fragment = Url::parse("https://example.com/page#section").unwrap();
 								        let normalized = normalize(&with_fragment);
 								        assert_eq!(normalized, "https://example.com/page");
 								    }
 								    // A trailing `/` on a non-root path is expected to be removed.
 								    #[test]
 								    fn normalize_strips_trailing_slash() {
 								        let with_slash = Url::parse("https://example.com/docs/").unwrap();
 								        let normalized = normalize(&with_slash);
 								        assert_eq!(normalized, "https://example.com/docs");
 								    }
 								    // The bare root path `/` is the one trailing slash that must be kept.
 								    #[test]
 								    fn normalize_keeps_root_slash() {
 								        let root = Url::parse("https://example.com/").unwrap();
 								        let normalized = normalize(&root);
 								        assert_eq!(normalized, "https://example.com/");
 								    }
 								    // The query string is expected to come through unchanged.
 								    #[test]
 								    fn normalize_preserves_query() {
 								        let with_query = Url::parse("https://example.com/search?q=rust&page=2").unwrap();
 								        let normalized = normalize(&with_query);
 								        assert_eq!(normalized, "https://example.com/search?q=rust&page=2");
 								    }
 								    // Host comes out lowercased; the path keeps its original case.
 								    #[test]
 								    fn normalize_lowercases_host() {
 								        let mixed_case = Url::parse("https://Example.COM/Path").unwrap();
 								        let normalized = normalize(&mixed_case);
 								        assert_eq!(normalized, "https://example.com/Path");
 								    }
 								    // A non-default port is expected to appear in the origin key.
 								    #[test]
 								    fn origin_includes_explicit_port() {
 								        let with_port = Url::parse("https://example.com:8443/foo").unwrap();
 								        let key = origin_key(&with_port);
 								        assert_eq!(key, "https://example.com:8443");
 								    }
 								    // With no explicit port in the URL, the origin key carries none either.
 								    #[test]
 								    fn origin_omits_default_port() {
 								        let without_port = Url::parse("https://example.com/foo").unwrap();
 								        let key = origin_key(&without_port);
 								        assert_eq!(key, "https://example.com");
 								    }
 								    // Same host, different scheme: the two origin keys must not be equal.
 								    #[test]
 								    fn different_schemes_are_different_origins() {
 								        let plain = Url::parse("http://example.com/").unwrap();
 								        let secure = Url::parse("https://example.com/").unwrap();
 								        let (plain_key, secure_key) = (origin_key(&plain), origin_key(&secure));
 								        assert_ne!(plain_key, secure_key);
 								    }
-												feat: add allow_subdomains and allow_external_links to CrawlConfig

Crawls are same-origin by default. Enable allow_subdomains to follow
sibling/child subdomains (blog.example.com from example.com), or
allow_external_links for full cross-origin crawling.

Root domain extraction uses a heuristic that handles two-part TLDs
(co.uk, com.au). Includes 5 unit tests for root_domain().

Bump to 0.3.12.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-14 19:33:06 +02:00
+								    // -- root_domain tests --
 								    // A bare two-label host is expected to be returned as-is.
 								    #[test]
 								    fn root_domain_simple() {
 								        let bare = Url::parse("https://example.com/page").unwrap();
 								        let root = root_domain(&bare);
 								        assert_eq!(root, "example.com");
 								    }
 								    // One subdomain level is expected to be stripped.
 								    #[test]
 								    fn root_domain_subdomain() {
 								        let blog = Url::parse("https://blog.example.com/page").unwrap();
 								        let root = root_domain(&blog);
 								        assert_eq!(root, "example.com");
 								    }
 								    // Several nested subdomain levels still reduce to the same root.
 								    #[test]
 								    fn root_domain_deep_subdomain() {
 								        let nested = Url::parse("https://a.b.c.example.com/").unwrap();
 								        let root = root_domain(&nested);
 								        assert_eq!(root, "example.com");
 								    }
 								    // A two-part TLD (`co.uk`) is expected to stay attached to the root.
 								    #[test]
 								    fn root_domain_country_tld() {
 								        let uk_blog = Url::parse("https://blog.example.co.uk/").unwrap();
 								        let root = root_domain(&uk_blog);
 								        assert_eq!(root, "example.co.uk");
 								    }
 								    // A leading `www.` label is expected to be stripped.
 								    #[test]
 								    fn root_domain_strips_www() {
 								        let www = Url::parse("https://www.example.com/").unwrap();
 								        let root = root_domain(&www);
 								        assert_eq!(root, "example.com");
 								    }
-												feat(fetch,llm): DoS hardening + glob validation + cleanup (P2) (#22)

* feat(fetch,llm): DoS hardening via response caps + glob validation (P2)

Response body caps:
- webclaw-fetch::Response::from_wreq now rejects bodies over 50 MB. Checks
  Content-Length up front (before the allocation) and the actual
  .bytes() length after (belt-and-braces against lying upstreams).
  Previously the HTML -> markdown conversion downstream could allocate
  multiple String copies per page; a 100 MB page would OOM the process.
- webclaw-llm providers (anthropic/openai/ollama) share a new
  response_json_capped helper with a 5 MB cap. Protects against a
  malicious or runaway provider response exhausting memory.

Crawler frontier cap: after each BFS depth level the frontier is
truncated to max(max_pages * 10, 100) entries, keeping the most
recently discovered links. Dense pages (tag clouds, search results)
used to push the frontier into the tens of thousands even after
max_pages halted new fetches.

Glob pattern validation: user-supplied include_patterns /
exclude_patterns are rejected at Crawler::new if they contain more
than 4 `**` wildcards or exceed 1024 chars. The backtracking matcher
degrades exponentially on deeply-nested `**` against long paths.

Cleanup:
- Removed blanket #![allow(dead_code)] from webclaw-cli/src/main.rs;
  no warnings surfaced, the suppression was obsolete.
- core/.gitignore: replaced overbroad *.json with specific local-
  artifact patterns (previous rule would have swallowed package.json,
  components.json, .smithery/*.json).

Tests: +4 validate_glob tests. Full workspace test: 283 passed
(webclaw-core + webclaw-fetch + webclaw-llm).

Version: 0.3.15 -> 0.3.16
CHANGELOG updated.

Refs: docs/AUDIT-2026-04-16.md (P2 section)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* chore: gitignore CLI research dumps, drop accidentally-tracked file

research-*.json output from `webclaw ... --research ...` got silently
swept into git by the relaxed *.json gitignore in the preceding commit.
The old blanket *.json rule was hiding both this legitimate scratch
file AND packages/create-webclaw/server.json (MCP registry config that
we DO want tracked).

Removes the research dump from git and adds a narrower research-*.json
ignore pattern so future CLI output doesn't get re-tracked by accident.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
											
										
										
											2026-04-16 19:44:08 +02:00
+								    // -- validate_glob tests --
 								    // Everyday patterns, up to and including four `**`, must pass validation.
 								    #[test]
 								    fn validate_glob_accepts_reasonable_patterns() {
 								        let accepted = [
 								            "/api/*",
 								            "/api/**",
 								            "/docs/**/page-*.html",
 								            "/a/**/b/**/c/**/d/**",
 								        ];
 								        for &pat in &accepted {
 								            assert!(validate_glob(pat).is_ok());
 								        }
 								    }
 								    // Five `**` is one more than MAX_GLOB_DOUBLESTAR = 4 allows.
 								    #[test]
 								    fn validate_glob_rejects_too_many_doublestars() {
 								        let verdict = validate_glob("/a/**/b/**/c/**/d/**/e/**");
 								        let rejected_for_doublestars = matches!(
 								            verdict,
 								            Err(FetchError::Build(ref msg)) if msg.contains("`**` wildcards")
 								        );
 								        assert!(rejected_for_doublestars);
 								    }
 								    // Four `***` runs count as four wildcards, not eight, so this stays in budget.
 								    #[test]
 								    fn validate_glob_treats_triple_star_as_one() {
 								        let verdict = validate_glob("/a/***/b/***/c/***/d/***");
 								        assert!(verdict.is_ok());
 								    }
 								    // 2048 bytes is twice the allowed pattern length.
 								    #[test]
 								    fn validate_glob_rejects_oversized_pattern() {
 								        let oversized = "x".repeat(2048);
 								        let verdict = validate_glob(&oversized);
 								        let rejected_for_length = matches!(
 								            verdict,
 								            Err(FetchError::Build(ref msg)) if msg.contains("exceeds")
 								        );
 								        assert!(rejected_for_length);
 								    }
-												Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io

											
										
										
											2026-03-23 18:31:11 +01:00
+								    // -- glob_match tests --
 								    #[test]
 								    fn glob_star_matches_single_segment() {
 								        let pattern = "/api/*";
 								        for &single_segment in &["/api/users", "/api/products"] {
 								            assert!(glob_match(pattern, single_segment));
 								        }
 								        // A second path segment is out of reach for a single `*`.
 								        assert!(!glob_match(pattern, "/api/users/123"));
 								    }
 								    #[test]
 								    fn glob_doublestar_matches_multiple_segments() {
 								        let pattern = "/api/**";
 								        for &under_api in &["/api/users", "/api/users/123", "/api/a/b/c/d"] {
 								            assert!(glob_match(pattern, under_api));
 								        }
 								        // The literal `/api/` prefix still has to match.
 								        assert!(!glob_match(pattern, "/docs/intro"));
 								    }
 								    // A wildcard-free pattern matches only the identical path.
 								    #[test]
 								    fn glob_exact_match() {
 								        let cases = [("/about", true), ("/about/team", false)];
 								        for &(path, expected) in &cases {
 								            assert_eq!(glob_match("/about", path), expected);
 								        }
 								    }
 								    // A `*` between two literal segments stands for exactly one segment.
 								    #[test]
 								    fn glob_middle_wildcard() {
 								        let pattern = "/docs/*/intro";
 								        let one_segment = glob_match(pattern, "/docs/v2/intro");
 								        let two_segments = glob_match(pattern, "/docs/v2/v3/intro");
 								        assert!(one_segment);
 								        assert!(!two_segments);
 								    }
 								    // The empty pattern accepts the empty path and nothing else.
 								    #[test]
 								    fn glob_no_pattern_matches_nothing() {
 								        let empty_pattern = "";
 								        assert!(glob_match(empty_pattern, ""));
 								        assert!(!glob_match(empty_pattern, "/foo"));
 								    }
 								    #[test]
 								    fn glob_trailing_star() {
 								        let pattern = "/blog*";
 								        // The trailing `*` may match nothing at all, or the rest of the segment.
 								        for &same_segment in &["/blog", "/blog-post"] {
 								            assert!(glob_match(pattern, same_segment));
 								        }
 								        // * doesn't cross /
 								        assert!(!glob_match(pattern, "/blog/post"));
 								    }
 								}