mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-19 01:58:06 +02:00
feat(map): layered URL discovery with bounded crawl fallback
Rescued from the stale perf/audit-fixes branch and ported cleanly onto current main (fetch + CLI only — the original commit never touched the server/MCP map surfaces).

`--map` used to return only what a site advertises in sitemap.xml, which is nothing for sites with no sitemap (e.g. Hacker News) or a thin one. Now discovery is layered:

- webclaw-fetch::discover_urls() / MapOptions — sitemaps first (authoritative, carries lastmod/priority/changefreq); when the sitemap is thin (< min_sitemap_urls) and the fallback is enabled, run a bounded same-origin crawl and harvest links from every fetched page plus the unfetched frontier, deduped against the sitemap set.
- sitemap.rs: gzip (.xml.gz) support via a new decode_sitemap_body() + FetchClient::fetch_raw() (raw bytes, no lossy UTF-8); deeper index recursion (3->5); 4 more fallback paths.
- CLI: --map-pages / --no-map-crawl / --map-limit; crawler logs now go to stderr so `--map -f json` stays machine-parseable.

One new dependency: flate2 (already resolved in the lockfile transitively). Includes the commit's unit tests (map dedup/origin, gzip decode).

Original work by the prior author on perf/audit-fixes; this re-applies only the map slice onto main.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c3e5ef5143
commit
179efbcf87
8 changed files with 485 additions and 12 deletions
|
|
@ -313,6 +313,18 @@ struct Cli {
|
|||
#[arg(long)]
|
||||
map: bool,
|
||||
|
||||
/// Max pages for --map's crawl fallback when the sitemap is thin [default: 150]
|
||||
#[arg(long)]
|
||||
map_pages: Option<usize>,
|
||||
|
||||
/// Disable --map's crawl fallback (sitemap-only discovery)
|
||||
#[arg(long)]
|
||||
no_map_crawl: bool,
|
||||
|
||||
/// Cap the number of URLs --map returns (default: uncapped)
|
||||
#[arg(long)]
|
||||
map_limit: Option<usize>,
|
||||
|
||||
// -- LLM options --
|
||||
/// Extract structured JSON using LLM (pass a JSON schema string or @file)
|
||||
#[arg(long)]
|
||||
|
|
@ -508,7 +520,13 @@ fn init_logging(verbose: bool) {
|
|||
EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new(default))
|
||||
};
|
||||
|
||||
tracing_subscriber::fmt().with_env_filter(filter).init();
|
||||
// Logs go to stderr, never stdout: stdout carries the actual result
|
||||
// (markdown / JSON / URL list). A stray WARN on stdout corrupts
|
||||
// machine-readable output — e.g. `--map --format json` piped to a parser.
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(filter)
|
||||
.with_writer(std::io::stderr)
|
||||
.init();
|
||||
}
|
||||
|
||||
/// Build FetchConfig from CLI flags.
|
||||
|
|
@ -1688,12 +1706,22 @@ async fn run_map(cli: &Cli) -> Result<(), String> {
|
|||
let client =
|
||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
|
||||
|
||||
let entries = webclaw_fetch::sitemap::discover(&client, url)
|
||||
.await
|
||||
.map_err(|e| format!("sitemap discovery failed: {e}"))?;
|
||||
// Layered discovery: sitemaps first, bounded crawl fallback when thin.
|
||||
let mut opts = webclaw_fetch::MapOptions::default();
|
||||
if let Some(pages) = cli.map_pages {
|
||||
opts.max_crawl_pages = pages;
|
||||
}
|
||||
if cli.no_map_crawl {
|
||||
opts.crawl_fallback = false;
|
||||
}
|
||||
if let Some(limit) = cli.map_limit {
|
||||
opts.max_urls = Some(limit);
|
||||
}
|
||||
|
||||
let entries = webclaw_fetch::discover_urls(&client, url, &opts).await;
|
||||
|
||||
if entries.is_empty() {
|
||||
eprintln!("no sitemap URLs found for {url}");
|
||||
eprintln!("no URLs found for {url}");
|
||||
} else {
|
||||
eprintln!("discovered {} URLs", entries.len());
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue