perf(core): hot-path extraction speedups + senior-grade hardening

Extraction ~22% faster on the corpus benchmark with byte-identical output:
- hoist recompiled CSS selectors in the markdown noise path
- single-pass shared og() meta parsing across vertical extractors
- output-safe QuickJS gating (skip the JS VM when no candidate data) +
  reuse the already-parsed document instead of re-parsing
- wreq connect_timeout + connection-pool tuning; dedup the retry loop

Reliability + correctness:
- char-boundary-safe truncation of LLM error bodies (shared helper)
- HTTP connect/read timeouts on all LLM provider clients
- isolate pdf-extract behind catch_unwind + spawn_blocking
- OSS server: crawl inherits the shared fetch profile; ProviderChain built
  once in AppState; request TimeoutLayer

API / safety / docs:
- #[non_exhaustive] on public enums + result structs (+ builders)
- #![forbid(unsafe_code)] on pure crates, deny on llm
- //! crate docs + doctests; scrub bypass/vendor/target specifics from
  public crate docs and comments

Tooling: [profile.release] lto/codegen-units/strip, MSRV pin, deny.toml +
cargo-deny CI, macOS test matrix. CLI main.rs split into focused modules.
This commit is contained in:
webclaw 2026-06-04 20:22:00 +02:00
parent e499e51e70
commit 02302e7a1d
62 changed files with 3761 additions and 3130 deletions

View file

@ -3,8 +3,12 @@ name = "webclaw-cli"
description = "CLI for extracting web content into LLM-optimized formats"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
[lints]
workspace = true
[[bin]]
name = "webclaw"
path = "src/main.rs"

View file

@ -48,7 +48,10 @@ async fn main() {
match mode.as_str() {
"capture" => capture().await,
"bench" => {
let iters: usize = std::env::args().nth(2).and_then(|s| s.parse().ok()).unwrap_or(60);
let iters: usize = std::env::args()
.nth(2)
.and_then(|s| s.parse().ok())
.unwrap_or(60);
bench(iters);
}
"snapshot" => {
@ -64,14 +67,21 @@ async fn main() {
async fn capture() {
fs::create_dir_all(CORPUS).unwrap();
let config = FetchConfig { browser: BrowserProfile::Chrome, ..FetchConfig::default() };
let config = FetchConfig {
browser: BrowserProfile::Chrome,
..FetchConfig::default()
};
let client = FetchClient::new(config).expect("build client");
let mut ok = 0;
for (i, u) in URLS.iter().enumerate() {
let name = format!(
"{:02}_{}.html",
i + 1,
u.replace("https://", "").chars().map(|c| if c.is_alphanumeric() { c } else { '_' }).take(40).collect::<String>()
u.replace("https://", "")
.chars()
.map(|c| if c.is_alphanumeric() { c } else { '_' })
.take(40)
.collect::<String>()
);
match client.fetch(u).await {
Ok(f) if f.html.len() > 1000 => {
@ -99,7 +109,9 @@ fn snapshot(label: &str) {
let mut n = 0;
for path in &files {
let html = fs::read_to_string(path).unwrap_or_default();
if html.is_empty() { continue; }
if html.is_empty() {
continue;
}
let stem = path.file_stem().unwrap().to_string_lossy().to_string();
let url = format!("https://corpus/{stem}");
match extract(&html, Some(&url)) {
@ -117,7 +129,9 @@ fn snapshot(label: &str) {
}
fn percentile(sorted: &[u128], p: f64) -> u128 {
if sorted.is_empty() { return 0; }
if sorted.is_empty() {
return 0;
}
let idx = ((sorted.len() as f64 - 1.0) * p).round() as usize;
sorted[idx]
}
@ -135,7 +149,10 @@ fn bench(iters: usize) {
}
println!("# perf_corpus bench docs={} iters={}", files.len(), iters);
println!("{:<42} {:>10} {:>10} {:>10} {:>10}", "doc(KB)", "extract_us", "llm_us", "p50_us", "p90_us");
println!(
"{:<42} {:>10} {:>10} {:>10} {:>10}",
"doc(KB)", "extract_us", "llm_us", "p50_us", "p90_us"
);
let mut grand_extract = 0u128;
let mut grand_llm = 0u128;
@ -143,8 +160,13 @@ fn bench(iters: usize) {
for path in &files {
let html = fs::read_to_string(path).unwrap_or_default();
if html.is_empty() { continue; }
let url = format!("https://corpus/{}", path.file_name().unwrap().to_string_lossy());
if html.is_empty() {
continue;
}
let url = format!(
"https://corpus/{}",
path.file_name().unwrap().to_string_lossy()
);
// warmup
for _ in 0..5 {
@ -158,7 +180,10 @@ fn bench(iters: usize) {
let mut total_times = Vec::with_capacity(iters);
for _ in 0..iters {
let t0 = Instant::now();
let ex = match extract(&html, Some(&url)) { Ok(e) => e, Err(_) => continue };
let ex = match extract(&html, Some(&url)) {
Ok(e) => e,
Err(_) => continue,
};
let t1 = Instant::now();
let txt = to_llm_text(&ex, Some(&url));
let t2 = Instant::now();
@ -178,11 +203,24 @@ fn bench(iters: usize) {
grand_llm += llm_p50;
grand_total_p50 += tot_p50;
let label = format!("{} ({}KB)", path.file_stem().unwrap().to_string_lossy(), html.len() / 1024);
println!("{:<42} {:>10} {:>10} {:>10} {:>10}", label.chars().take(42).collect::<String>(), ex_p50, llm_p50, tot_p50, tot_p90);
let label = format!(
"{} ({}KB)",
path.file_stem().unwrap().to_string_lossy(),
html.len() / 1024
);
println!(
"{:<42} {:>10} {:>10} {:>10} {:>10}",
label.chars().take(42).collect::<String>(),
ex_p50,
llm_p50,
tot_p50,
tot_p90
);
}
println!("---");
println!("CORPUS_PASS_P50_SUM_US extract={grand_extract} llm={grand_llm} total={grand_total_p50}");
println!(
"CORPUS_PASS_P50_SUM_US extract={grand_extract} llm={grand_llm} total={grand_total_p50}"
);
println!("(lower is better; total = one full extract+llm pass over the whole corpus at p50)");
}

View file

@ -198,7 +198,7 @@ fn fmt_int(n: usize) -> String {
let bytes = s.as_bytes();
let mut out = String::with_capacity(s.len() + s.len() / 3);
for (i, b) in bytes.iter().enumerate() {
if i > 0 && (bytes.len() - i).is_multiple_of(3) {
if i > 0 && (bytes.len() - i) % 3 == 0 {
out.push(',');
}
out.push(*b as char);

View file

@ -0,0 +1,324 @@
//! CLI argument definitions: clap structs/enums and their conversions.
use std::path::PathBuf;
use clap::{Parser, Subcommand, ValueEnum};
use webclaw_fetch::BrowserProfile;
use webclaw_pdf::PdfMode;
// Top-level argument set for the flag-based flow.
//
// clap renders every `///` comment below as `--help` text, so treat them as
// user-facing copy. clap also appends `[default: …]` to the help of any
// argument that sets `default_value`, so the descriptions deliberately do not
// repeat the default (doing so prints it twice).
#[derive(Parser)]
#[command(name = "webclaw", about = "Extract web content for LLMs", version)]
pub struct Cli {
    /// Optional subcommand. When omitted, the CLI falls back to the
    /// traditional flag-based flow (URL + --format, --crawl, etc.).
    /// Subcommands are used for flows that don't fit that model.
    #[command(subcommand)]
    pub command: Option<Commands>,

    /// URLs to fetch (multiple allowed)
    #[arg()]
    pub urls: Vec<String>,

    /// File with URLs (one per line)
    #[arg(long)]
    pub urls_file: Option<String>,

    /// Output format (markdown, json, text, llm, html)
    #[arg(short, long, default_value = "markdown")]
    pub format: OutputFormat,

    /// Browser to impersonate
    #[arg(short, long, default_value = "chrome")]
    pub browser: Browser,

    /// Proxy URL (http://user:pass@host:port or socks5://host:port)
    #[arg(short, long, env = "WEBCLAW_PROXY")]
    pub proxy: Option<String>,

    /// File with proxies (host:port:user:pass, one per line). Rotates per request.
    #[arg(long, env = "WEBCLAW_PROXY_FILE")]
    pub proxy_file: Option<String>,

    /// Request timeout in seconds
    #[arg(short, long, default_value = "30")]
    pub timeout: u64,

    /// Extract from local HTML file instead of fetching
    #[arg(long)]
    pub file: Option<String>,

    /// Read HTML from stdin
    #[arg(long)]
    pub stdin: bool,

    /// Include metadata in output (always included in JSON)
    #[arg(long)]
    pub metadata: bool,

    /// Output raw fetched HTML instead of extracting
    #[arg(long)]
    pub raw_html: bool,

    /// CSS selectors to include (comma-separated, e.g. "article,.content")
    #[arg(long)]
    pub include: Option<String>,

    /// CSS selectors to exclude (comma-separated, e.g. "nav,.sidebar,footer")
    #[arg(long)]
    pub exclude: Option<String>,

    /// Only extract main content (article/main element)
    #[arg(long)]
    pub only_main_content: bool,

    /// Custom headers (repeatable, e.g. -H "Cookie: foo=bar")
    #[arg(short = 'H', long = "header")]
    pub headers: Vec<String>,

    /// Cookie string (shorthand for -H "Cookie: ...")
    #[arg(long)]
    pub cookie: Option<String>,

    /// JSON cookie file (Chrome extension format: [{name, value, domain, ...}])
    #[arg(long)]
    pub cookie_file: Option<String>,

    /// Enable verbose logging
    #[arg(short, long)]
    pub verbose: bool,

    /// Compare against a previous JSON snapshot
    #[arg(long)]
    pub diff_with: Option<String>,

    /// Watch a URL for changes. Checks at the specified interval and reports diffs.
    #[arg(long)]
    pub watch: bool,

    /// Watch interval in seconds
    #[arg(long, default_value = "300")]
    pub watch_interval: u64,

    /// Command to run when changes are detected (receives diff JSON on stdin)
    #[arg(long)]
    pub on_change: Option<String>,

    /// Webhook URL: POST a JSON payload when an operation completes.
    /// Works with crawl, batch, watch (on change), and single URL modes.
    #[arg(long, env = "WEBCLAW_WEBHOOK_URL")]
    pub webhook: Option<String>,

    /// Extract brand identity (colors, fonts, logo)
    #[arg(long)]
    pub brand: bool,

    // -- PDF options --
    /// PDF extraction mode: auto (error on empty) or fast (return whatever text is found)
    #[arg(long, default_value = "auto")]
    pub pdf_mode: PdfModeArg,

    // -- Crawl options --
    /// Enable recursive crawling of same-domain links
    #[arg(long)]
    pub crawl: bool,

    /// Max crawl depth
    #[arg(long, default_value = "1")]
    pub depth: usize,

    /// Max pages to crawl
    #[arg(long, default_value = "20")]
    pub max_pages: usize,

    /// Max concurrent requests
    #[arg(long, default_value = "5")]
    pub concurrency: usize,

    /// Delay between requests in ms
    #[arg(long, default_value = "100")]
    pub delay: u64,

    /// Only crawl URLs matching this path prefix
    #[arg(long)]
    pub path_prefix: Option<String>,

    /// Glob patterns for crawl URL paths to include (comma-separated, e.g. "/api/*,/guides/**")
    #[arg(long)]
    pub include_paths: Option<String>,

    /// Glob patterns for crawl URL paths to exclude (comma-separated, e.g. "/changelog/*,/blog/*")
    #[arg(long)]
    pub exclude_paths: Option<String>,

    /// Path to save/resume crawl state. On Ctrl+C: saves progress. On start: resumes if file exists.
    #[arg(long)]
    pub crawl_state: Option<PathBuf>,

    /// Seed crawl frontier from sitemap discovery (robots.txt + /sitemap.xml)
    #[arg(long)]
    pub sitemap: bool,

    /// Discover URLs from sitemap and print them (one per line; JSON array with --format json)
    #[arg(long)]
    pub map: bool,

    // -- LLM options --
    /// Extract structured JSON using LLM (pass a JSON schema string or @file)
    #[arg(long)]
    pub extract_json: Option<String>,

    /// Extract using natural language prompt
    #[arg(long)]
    pub extract_prompt: Option<String>,

    /// Summarize content using LLM (optional: number of sentences, default 3)
    #[arg(long, num_args = 0..=1, default_missing_value = "3")]
    pub summarize: Option<usize>,

    /// Force a specific LLM provider (ollama, openai, anthropic)
    #[arg(long, env = "WEBCLAW_LLM_PROVIDER")]
    pub llm_provider: Option<String>,

    /// Override the LLM model name
    #[arg(long, env = "WEBCLAW_LLM_MODEL")]
    pub llm_model: Option<String>,

    /// Override the LLM base URL (Ollama, OpenAI-compatible, or Anthropic-compatible)
    #[arg(long, env = "WEBCLAW_LLM_BASE_URL")]
    pub llm_base_url: Option<String>,

    // -- Cloud API options --
    /// Webclaw Cloud API key for automatic fallback on bot-protected or JS-rendered sites
    #[arg(long, env = "WEBCLAW_API_KEY")]
    pub api_key: Option<String>,

    /// Force all requests through the cloud API (skip local extraction)
    #[arg(long)]
    pub cloud: bool,

    /// Run deep research on a topic via the cloud API. Requires --api-key.
    /// Saves full result (report + sources + findings) to a JSON file.
    #[arg(long)]
    pub research: Option<String>,

    /// Enable deep research mode (longer, more thorough report). Used with --research.
    #[arg(long)]
    pub deep: bool,

    /// Output directory: save each page to a separate file instead of stdout.
    /// Works with --crawl, batch (multiple URLs), and single URL mode.
    /// Filenames are derived from URL paths (e.g. /docs/api -> docs/api.md).
    #[arg(long)]
    pub output_dir: Option<PathBuf>,
}
// Subcommands for flows that do not fit the flag-based model of `Cli`.
// The `///` comments on variants and fields are consumed by clap's derive as
// help text, so they are user-facing copy rather than internal notes.
#[derive(Subcommand)]
pub enum Commands {
    /// Per-URL extraction micro-benchmark: compares raw HTML vs. the
    /// webclaw --format llm output on token count, bytes, and
    /// extraction time. Uses an approximate tokenizer (see `--help`).
    Bench {
        /// URL to benchmark.
        url: String,
        /// Emit a single JSON line instead of the ASCII table.
        /// Machine-readable shape stable across releases.
        #[arg(long)]
        json: bool,
        /// Optional path to a facts.json (same schema as the repo's
        /// benchmarks/facts.json) for a fidelity column.
        #[arg(long)]
        facts: Option<PathBuf>,
    },
    /// List all vertical extractors in the catalog.
    ///
    /// Each entry has a stable `name` (usable with `webclaw vertical <name>`),
    /// a human-friendly label, a one-line description, and the URL
    /// patterns it claims. The same data is served by `/v1/extractors`
    /// when running the REST API.
    Extractors {
        /// Emit JSON instead of a human-friendly table.
        #[arg(long)]
        json: bool,
    },
    /// Run a vertical extractor by name. Returns typed JSON with fields
    /// specific to the target site (title, price, author, rating, etc.)
    /// rather than generic markdown.
    ///
    /// Use `webclaw extractors` to see the full list. Example:
    /// `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/`.
    Vertical {
        /// Vertical name (e.g. `reddit`, `github_repo`, `trustpilot_reviews`).
        name: String,
        /// URL to extract.
        url: String,
        /// Emit compact JSON (single line). Default is pretty-printed.
        #[arg(long)]
        raw: bool,
    },
}
// Output formats accepted by `--format`. Variants carry no `///` docs on
// purpose: clap would surface them as per-value help text.
#[derive(Clone, ValueEnum)]
pub enum OutputFormat {
    Markdown,
    Json,
    Text,
    Llm,
    Html,
}
impl OutputFormat {
/// Map to the cloud API's `formats` string. Single source of truth for the
/// format names the REST API expects.
pub fn as_api_str(&self) -> &'static str {
match self {
OutputFormat::Markdown => "markdown",
OutputFormat::Json => "json",
OutputFormat::Text => "text",
OutputFormat::Llm => "llm",
OutputFormat::Html => "html",
}
}
}
// Browser profiles accepted by `--browser`. Each variant maps one-to-one onto
// a `BrowserProfile` through the `From<Browser>` impl in this file.
#[derive(Clone, ValueEnum)]
pub enum Browser {
    Chrome,
    Firefox,
    /// Safari iOS 26. Pair with a country-matched residential proxy for sites
    /// that reject non-mobile profiles.
    SafariIos,
    Random,
}
// CLI-facing mirror of `PdfMode`, accepted by `--pdf-mode`. It exists so the
// clap `ValueEnum` derive lives in this crate; the `From<PdfModeArg>` impl
// below converts it into the `webclaw_pdf` type.
#[derive(Clone, ValueEnum, Default)]
pub enum PdfModeArg {
    /// Error if PDF has no extractable text (catches scanned PDFs)
    #[default]
    Auto,
    /// Return whatever text is found, even if empty
    Fast,
}
/// Converts the CLI-level PDF mode into the `webclaw_pdf` mode of the same name.
impl From<PdfModeArg> for PdfMode {
    fn from(mode: PdfModeArg) -> Self {
        match mode {
            PdfModeArg::Auto => Self::Auto,
            PdfModeArg::Fast => Self::Fast,
        }
    }
}
/// Converts the CLI-level browser choice into the `webclaw_fetch` profile of
/// the same name.
impl From<Browser> for BrowserProfile {
    fn from(choice: Browser) -> Self {
        match choice {
            Browser::Chrome => Self::Chrome,
            Browser::Firefox => Self::Firefox,
            Browser::SafariIos => Self::SafariIos,
            Browser::Random => Self::Random,
        }
    }
}

View file

@ -0,0 +1,823 @@
//! Input handling and fetching: config building, URL/cookie parsing, empty-page
//! detection, output-file writing, and the fetch+extract entry points (local,
//! remote, and cloud fallback).
use std::io::{self, Read as _};
use std::path::{Path, PathBuf};
use std::process;
use webclaw_core::{ExtractionOptions, ExtractionResult, extract_with_options};
use webclaw_fetch::{FetchClient, FetchConfig, FetchResult};
use crate::cli::Cli;
/// Known anti-bot challenge page titles (case-insensitive prefix match).
///
/// Entries must be lowercase: `detect_empty` lowercases the page title and
/// tests it with `starts_with` against each entry.
const ANTIBOT_TITLES: &[&str] = &[
    "just a moment",
    "attention required",
    "access denied",
    "checking your browser",
    "please wait",
    "one more step",
    "verify you are human",
    "bot verification",
    "security check",
    "ddos protection",
];

/// URL host/path fragments that indicate a GDPR/cookie consent redirect.
///
/// Entries must be lowercase: `is_consent_wall` ASCII-lowercases the final URL
/// and tests it with `contains` against each entry, so a fragment matches
/// anywhere in the URL (host, path, or query).
const CONSENT_URL_FRAGMENTS: &[&str] = &[
    "://consent.",
    "/consent?",
    "/consent/",
    "collectconsent",
    "consentcheck",
    "/cmp/",
    "guce.advertising.com",
];

/// English consent-wall title prefixes. Many providers localize this page, so
/// this is a best-effort secondary signal. URL shape is the primary signal.
///
/// Entries must be lowercase (matched as prefixes of the lowercased title) and
/// are only consulted when the page has at most 50 words.
const CONSENT_TITLES: &[&str] = &[
    "before you continue",
    "your privacy choices",
    "we value your privacy",
    "we care about your privacy",
    "cookie consent",
    "consent required",
];
/// Why a page returned empty or near-empty content, as classified by
/// `detect_empty`.
#[derive(Debug, PartialEq, Eq)]
pub enum EmptyReason {
    /// Anti-bot challenge page (Cloudflare, Akamai, etc.)
    Antibot,
    /// GDPR/cookie consent redirect.
    ConsentWall,
    /// JS-only SPA that returns an empty shell without a browser
    JsRequired,
    /// Page has content.
    None,
}
pub fn detect_empty(result: &ExtractionResult) -> EmptyReason {
// Consent walls can have a tiny body, so check before the content
// short-circuit.
if is_consent_wall(result) {
return EmptyReason::ConsentWall;
}
// Has real content. Nothing to warn about.
if result.metadata.word_count > 50 || !result.content.markdown.is_empty() {
return EmptyReason::None;
}
// Check for known anti-bot challenge titles
if let Some(ref title) = result.metadata.title {
let lower = title.to_lowercase();
if ANTIBOT_TITLES.iter().any(|t| lower.starts_with(t)) {
return EmptyReason::Antibot;
}
}
// Empty content with no title or a generic SPA shell = JS-only site
if result.metadata.word_count == 0 && result.content.links.is_empty() {
return EmptyReason::JsRequired;
}
EmptyReason::None
}
/// A consent wall is identified by either:
/// 1. The final URL pointing at a known consent host/path, or
/// 2. A consent-wall title prefix with a very small body.
fn is_consent_wall(result: &ExtractionResult) -> bool {
if let Some(ref url) = result.metadata.url {
let lower = url.to_ascii_lowercase();
if CONSENT_URL_FRAGMENTS
.iter()
.any(|fragment| lower.contains(fragment))
{
return true;
}
}
if result.metadata.word_count <= 50
&& let Some(ref title) = result.metadata.title
{
let lower = title.to_lowercase();
if CONSENT_TITLES
.iter()
.any(|prefix| lower.starts_with(prefix))
{
return true;
}
}
false
}
/// Print a yellow `warning:` block on stderr explaining why `url` produced no
/// usable content and what the user can try next.
///
/// Nothing is printed for [`EmptyReason::None`].
pub fn warn_empty(url: &str, reason: &EmptyReason) {
    let message = match reason {
        EmptyReason::None => return,
        EmptyReason::Antibot => format!(
            "\x1b[33mwarning:\x1b[0m Anti-bot protection detected on {url}\n\
             This site requires CAPTCHA solving or browser rendering.\n\
             Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing"
        ),
        EmptyReason::ConsentWall => format!(
            "\x1b[33mwarning:\x1b[0m GDPR/cookie consent wall detected on {url}\n\
             The site redirected to a consent page and returned no usable content.\n\
             Try a different region via --proxy, or pass a pre-accepted consent cookie\n\
             via --cookie / --cookie-file."
        ),
        EmptyReason::JsRequired => format!(
            "\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\
             This site requires JavaScript rendering (SPA).\n\
             Use the webclaw Cloud API for JS rendering: https://webclaw.io/pricing"
        ),
    };
    eprintln!("{message}");
}
/// Build FetchConfig from CLI flags.
///
/// Proxy selection, in priority order:
/// 1. `--proxy` sets a single static proxy (no rotation) and wins over
///    everything else.
/// 2. `--proxy-file` loads a pool of proxies and rotates per-request. A file
///    that fails to parse is reported as a warning and the run continues
///    without a proxy.
/// 3. A `proxies.txt` in the working directory is auto-loaded when it parses
///    to a non-empty pool; problems with it are ignored silently because the
///    user did not ask for it.
///
/// Headers start from a default `Accept-Language`, then `-H "Name: Value"`
/// flags are applied, then `--cookie` replaces any `Cookie` header, and
/// finally `--cookie-file` cookies are appended to it.
///
/// A `-H` value without a `:` separator or with an empty name is skipped with
/// a warning on stderr instead of being dropped silently.
///
/// Exits the process with status 1 if `--cookie-file` cannot be parsed.
pub fn build_fetch_config(cli: &Cli) -> FetchConfig {
    let (proxy, proxy_pool) = if cli.proxy.is_some() {
        (cli.proxy.clone(), Vec::new())
    } else if let Some(ref path) = cli.proxy_file {
        match webclaw_fetch::parse_proxy_file(path) {
            Ok(pool) => (None, pool),
            Err(e) => {
                eprintln!("warning: {e}");
                (None, Vec::new())
            }
        }
    } else if std::path::Path::new("proxies.txt").exists() {
        // Auto-load proxies.txt from working directory if present
        match webclaw_fetch::parse_proxy_file("proxies.txt") {
            Ok(pool) if !pool.is_empty() => {
                eprintln!("loaded {} proxies from proxies.txt", pool.len());
                (None, pool)
            }
            _ => (None, Vec::new()),
        }
    } else {
        (None, Vec::new())
    };

    let mut headers = std::collections::HashMap::from([(
        "Accept-Language".to_string(),
        "en-US,en;q=0.9".to_string(),
    )]);

    // Parse -H "Key: Value" flags. Only the first ':' separates name from
    // value, so values may themselves contain colons (URLs, timestamps).
    for h in &cli.headers {
        match h.split_once(':') {
            Some((key, val)) if !key.trim().is_empty() => {
                headers.insert(key.trim().to_string(), val.trim().to_string());
            }
            _ => eprintln!("warning: ignoring malformed header {h:?} (expected \"Name: Value\")"),
        }
    }

    // --cookie shorthand
    if let Some(ref cookie) = cli.cookie {
        headers.insert("Cookie".to_string(), cookie.clone());
    }

    // --cookie-file: parse JSON array of {name, value, domain, ...}
    if let Some(ref path) = cli.cookie_file {
        match parse_cookie_file(path) {
            Ok(cookie_str) => {
                // Append to whatever Cookie header is already set
                let merged = match headers.get("Cookie") {
                    Some(existing) => format!("{existing}; {cookie_str}"),
                    None => cookie_str,
                };
                headers.insert("Cookie".to_string(), merged);
            }
            Err(e) => {
                eprintln!("error: failed to parse cookie file: {e}");
                process::exit(1);
            }
        }
    }

    FetchConfig {
        browser: cli.browser.clone().into(),
        proxy,
        proxy_pool,
        timeout: std::time::Duration::from_secs(cli.timeout),
        pdf_mode: cli.pdf_mode.clone().into(),
        headers,
        ..Default::default()
    }
}
/// Turn a JSON cookie export into the value of a `Cookie` request header.
///
/// The file must contain a JSON array of objects in the Chrome-extension
/// export shape (`[{name, value, domain, path, ...}]`). Only the string
/// fields `name` and `value` are read; entries lacking either are skipped.
///
/// # Errors
/// Returns a message when the file cannot be read, is not a JSON array, or
/// yields no usable `name`/`value` pair.
fn parse_cookie_file(path: &str) -> Result<String, String> {
    let raw = std::fs::read_to_string(path).map_err(|e| format!("cannot read {path}: {e}"))?;
    let entries: Vec<serde_json::Value> =
        serde_json::from_str(&raw).map_err(|e| format!("invalid JSON: {e}"))?;

    let mut pairs: Vec<String> = Vec::new();
    for entry in &entries {
        let name = entry.get("name").and_then(|v| v.as_str());
        let value = entry.get("value").and_then(|v| v.as_str());
        if let (Some(name), Some(value)) = (name, value) {
            pairs.push(format!("{name}={value}"));
        }
    }

    if pairs.is_empty() {
        Err("no cookies found in file".to_string())
    } else {
        Ok(pairs.join("; "))
    }
}
/// Build the extraction options from `--include`, `--exclude`,
/// `--only-main-content`, `--raw-html` and `--format`.
///
/// `--include` / `--exclude` are comma-separated selector lists. Each entry is
/// trimmed, and empty entries (from a trailing comma, a doubled comma, or an
/// empty flag value) are dropped so no empty selector string is passed on.
/// Raw HTML is retained when either `--raw-html` is set or the output format
/// is HTML.
pub fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
    /// Split a comma-separated selector list into trimmed, non-empty entries.
    fn split_selectors(raw: Option<&str>) -> Vec<String> {
        raw.map(|list| {
            list.split(',')
                .map(str::trim)
                .filter(|selector| !selector.is_empty())
                .map(str::to_string)
                .collect()
        })
        .unwrap_or_default()
    }

    ExtractionOptions {
        include_selectors: split_selectors(cli.include.as_deref()),
        exclude_selectors: split_selectors(cli.exclude.as_deref()),
        only_main_content: cli.only_main_content,
        include_raw_html: cli.raw_html || matches!(cli.format, crate::cli::OutputFormat::Html),
    }
}
/// Normalize a URL: prepend `https://` if no scheme is present.
///
/// Surrounding whitespace is trimmed first. The input counts as already
/// having a scheme only when the text before its first `://` is a valid URI
/// scheme (a letter followed by letters, digits, `+`, `-` or `.`). A bare
/// domain whose path or query merely embeds another URL, such as
/// `example.com/login?next=https://example.com/`, therefore still gets the
/// `https://` prefix.
pub fn normalize_url(url: &str) -> String {
    /// RFC 3986 section 3.1: `ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )`.
    fn is_uri_scheme(candidate: &str) -> bool {
        let mut chars = candidate.chars();
        chars.next().is_some_and(|c| c.is_ascii_alphabetic())
            && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
    }

    let trimmed = url.trim();
    let has_scheme = trimmed
        .split_once("://")
        .is_some_and(|(scheme, _)| is_uri_scheme(scheme));

    if has_scheme {
        trimmed.to_string()
    } else {
        format!("https://{trimmed}")
    }
}
/// Derive a filename from a URL for `--output-dir`.
///
/// Strips the scheme/host, maps the path to a relative filesystem path, and
/// appends an extension matching the output format (`md` for markdown and
/// llm, `json`, `txt`, `html`).
///
/// The result is always a relative path without `.` / `..` components:
/// * empty, `.` and `..` path segments are dropped;
/// * the query string is flattened (its `/` become `_`) before it is appended,
///   so it cannot add path components of its own;
/// * root URLs become `<host>/index`, and a missing or unparseable host falls
///   back to `unknown` instead of producing a leading `/`.
pub fn url_to_filename(raw_url: &str, format: &crate::cli::OutputFormat) -> String {
    use crate::cli::OutputFormat;

    let ext = match format {
        OutputFormat::Markdown | OutputFormat::Llm => "md",
        OutputFormat::Json => "json",
        OutputFormat::Text => "txt",
        OutputFormat::Html => "html",
    };

    let parsed = url::Url::parse(raw_url);
    let (host, path, query) = match &parsed {
        Ok(u) => (
            u.host_str().unwrap_or("unknown").to_string(),
            u.path().to_string(),
            u.query().map(String::from),
        ),
        Err(_) => (String::new(), String::new(), None),
    };

    // Drop empty / "." / ".." path segments so a URL path like
    // `/../../etc/passwd` can't climb out of the output directory.
    let cleaned_path: String = path
        .split('/')
        .filter(|seg| !seg.is_empty() && *seg != "." && *seg != "..")
        .collect::<Vec<_>>()
        .join("/");

    let mut stem = cleaned_path;
    if stem.is_empty() {
        // Use hostname for root URLs to avoid collisions in batch mode
        let clean_host = host.strip_prefix("www.").unwrap_or(&host);
        // An empty host would turn the stem into "/index", an absolute path.
        let host_dir = if clean_host.is_empty() {
            "unknown".to_string()
        } else {
            clean_host.replace('.', "_")
        };
        stem = format!("{host_dir}/index");
    }

    // Append query params so /p?id=123 doesn't collide with /p?id=456.
    // '/' is flattened first: the sanitizer below keeps '/' and '.', so a raw
    // query such as `x=/../../etc` would otherwise add ".." components.
    if let Some(q) = query {
        stem.push('_');
        stem.extend(q.chars().map(|c| if c == '/' { '_' } else { c }));
    }

    // Sanitize: keep alphanumeric, dash, underscore, dot, slash
    let sanitized: String = stem
        .chars()
        .map(|c| {
            if c.is_alphanumeric() || matches!(c, '-' | '_' | '.' | '/') {
                c
            } else {
                '_'
            }
        })
        .collect();

    format!("{sanitized}.{ext}")
}
/// Validate a caller-supplied output name (the `filename` half of a
/// `url,filename` CSV line) as a path that stays inside the output directory.
///
/// Accepts only paths made of normal and `.` components. Any `..` component,
/// a root, or a drive/UNC prefix is refused, as is the empty string.
///
/// # Errors
/// Returns a human-readable reason naming the offending input.
fn safe_relative_filename(filename: &str) -> Result<PathBuf, String> {
    use std::path::Component;

    let candidate = Path::new(filename);

    // The first offending component decides the error message.
    let rejection = candidate.components().find_map(|part| match part {
        Component::Normal(_) | Component::CurDir => None,
        Component::ParentDir => Some(format!("refusing path with '..' component: {filename}")),
        Component::RootDir | Component::Prefix(_) => {
            Some(format!("refusing absolute output path: {filename}"))
        }
    });
    if let Some(reason) = rejection {
        return Err(reason);
    }

    if candidate.as_os_str().is_empty() {
        return Err("empty output filename".to_string());
    }

    Ok(candidate.to_path_buf())
}
/// Save `content` as `filename` underneath `dir`, creating any missing
/// directories along the way, and report the saved path on stderr.
///
/// `filename` may come from an attacker-controlled `--urls-file`
/// (`url,filename` CSV), so two guards run before anything is written:
/// the name itself is checked for traversal components, and the canonical
/// form of the destination's parent directory must still sit under the
/// canonical output directory.
///
/// # Errors
/// Returns a message when the name is rejected, a directory cannot be created
/// or resolved, the destination escapes `dir`, or the write fails.
pub fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
    let make_dirs = |path: &Path| {
        std::fs::create_dir_all(path)
            .map_err(|e| format!("failed to create directory {}: {e}", path.display()))
    };

    let relative = safe_relative_filename(filename)?;
    let target = dir.join(&relative);

    // The output dir must exist before it can be canonicalized.
    make_dirs(dir)?;
    let root = dir
        .canonicalize()
        .map_err(|e| format!("failed to resolve output dir {}: {e}", dir.display()))?;

    if let Some(parent) = target.parent() {
        make_dirs(parent)?;
        let resolved_parent = parent
            .canonicalize()
            .map_err(|e| format!("failed to resolve {}: {e}", parent.display()))?;
        let inside_root = resolved_parent.starts_with(&root);
        if !inside_root {
            return Err(format!(
                "refusing to write outside output dir: {}",
                target.display()
            ));
        }
    }

    std::fs::write(&target, content)
        .map_err(|e| format!("failed to write {}: {e}", target.display()))?;

    let word_count = content.split_whitespace().count();
    eprintln!("Saved: {} ({word_count} words)", target.display());
    Ok(())
}
/// Gather every URL to process: positional arguments first, then the lines
/// of `--urls-file`, each normalized so bare domains gain a scheme.
///
/// Each entry is `(url, optional_custom_filename)`. In the URLs file, blank
/// lines and lines starting with `#` are skipped. A line of the form
/// `url,filename` supplies a custom filename (an empty name counts as none);
/// any other line yields `None`, leaving the caller to derive the filename
/// from the URL.
///
/// # Errors
/// Returns a message when `--urls-file` cannot be read.
pub fn collect_urls(cli: &Cli) -> Result<Vec<(String, Option<String>)>, String> {
    let mut entries: Vec<(String, Option<String>)> = Vec::with_capacity(cli.urls.len());
    for url in &cli.urls {
        entries.push((normalize_url(url), None));
    }

    let Some(path) = cli.urls_file.as_ref() else {
        return Ok(entries);
    };

    let content =
        std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;

    let from_file = content
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty() && !line.starts_with('#'))
        .map(|line| match line.split_once(',') {
            Some((url_part, name_part)) => {
                let name = name_part.trim();
                let custom = (!name.is_empty()).then(|| name.to_string());
                (normalize_url(url_part.trim()), custom)
            }
            None => (normalize_url(line), None),
        });
    entries.extend(from_file);

    Ok(entries)
}
/// Result that can be either a local extraction or a cloud API JSON response.
pub enum FetchOutput {
    /// Extraction produced in-process. Boxed to keep the enum small.
    Local(Box<ExtractionResult>),
    /// Raw JSON body returned by the cloud API's scrape call; convert it with
    /// `into_extraction` when an `ExtractionResult` is needed.
    Cloud(serde_json::Value),
}
impl FetchOutput {
    /// Consume the output and produce an `ExtractionResult`.
    ///
    /// A local result is simply unboxed. For a cloud response, the
    /// `"extraction"` field is tried first; if it is absent or does not
    /// deserialize, the whole response is tried as an `ExtractionResult`.
    ///
    /// # Errors
    /// Returns a message when neither shape deserializes.
    pub fn into_extraction(self) -> Result<ExtractionResult, String> {
        let resp = match self {
            Self::Local(local) => return Ok(*local),
            Self::Cloud(resp) => resp,
        };

        let nested: Option<ExtractionResult> = resp
            .get("extraction")
            .and_then(|field| serde_json::from_value(field.clone()).ok());

        match nested {
            Some(extraction) => Ok(extraction),
            None => serde_json::from_value(resp)
                .map_err(|_| "could not parse extraction from cloud response".to_string()),
        }
    }
}
/// Fetch a URL and extract content, handling PDF detection automatically.
/// Falls back to cloud API when bot protection or JS rendering is detected.
pub async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
// Local sources: read and extract as HTML
if cli.stdin {
let mut buf = String::new();
io::stdin()
.read_to_string(&mut buf)
.map_err(|e| format!("failed to read stdin: {e}"))?;
let options = build_extraction_options(cli);
return extract_with_options(&buf, None, &options)
.map(|r| FetchOutput::Local(Box::new(r)))
.map_err(|e| format!("extraction error: {e}"));
}
if let Some(ref path) = cli.file {
let html =
std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
let options = build_extraction_options(cli);
return extract_with_options(&html, None, &options)
.map(|r| FetchOutput::Local(Box::new(r)))
.map_err(|e| format!("extraction error: {e}"));
}
let raw_url = cli
.urls
.first()
.ok_or("no input provided -- pass a URL, --file, or --stdin")?;
let url = normalize_url(raw_url);
let url = url.as_str();
let cloud_client = webclaw_fetch::cloud::CloudClient::new(cli.api_key.as_deref());
// --cloud: skip local, go straight to cloud API
if cli.cloud {
let c =
cloud_client.ok_or("--cloud requires WEBCLAW_API_KEY (set via env or --api-key)")?;
let options = build_extraction_options(cli);
let resp = c
.scrape(
url,
&[cli.format.as_api_str()],
&options.include_selectors,
&options.exclude_selectors,
options.only_main_content,
)
.await?;
return Ok(FetchOutput::Cloud(resp));
}
// Normal path: try local first
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(cli);
let result = client
.fetch_and_extract_with_options(url, &options)
.await
.map_err(|e| format!("fetch error: {e}"))?;
// Check if we should fall back to cloud
let reason = detect_empty(&result);
if !matches!(reason, EmptyReason::None) {
if let Some(ref c) = cloud_client {
eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API...");
match c
.scrape(
url,
&[cli.format.as_api_str()],
&options.include_selectors,
&options.exclude_selectors,
options.only_main_content,
)
.await
{
Ok(resp) => return Ok(FetchOutput::Cloud(resp)),
Err(e) => {
eprintln!("\x1b[33mwarning:\x1b[0m cloud fallback failed: {e}");
// Fall through to return the local result with a warning
}
}
}
warn_empty(url, &reason);
}
Ok(FetchOutput::Local(Box::new(result)))
}
/// Obtain raw HTML without running extraction. Used for --raw-html and brand
/// extraction.
///
/// `--stdin` and `--file` inputs are wrapped in a synthetic response (empty
/// URL, status 200, default headers and elapsed time). Otherwise the first
/// URL is normalized and fetched with the configured client.
///
/// # Errors
/// Returns a message when the local input cannot be read, no input was given,
/// the client cannot be built, or the fetch fails.
pub async fn fetch_html(cli: &Cli) -> Result<FetchResult, String> {
    // Shape locally supplied HTML like a successful fetch.
    let synthetic = |html: String| FetchResult {
        html,
        url: String::new(),
        status: 200,
        headers: Default::default(),
        elapsed: Default::default(),
    };

    if cli.stdin {
        let mut buf = String::new();
        io::stdin()
            .read_to_string(&mut buf)
            .map_err(|e| format!("failed to read stdin: {e}"))?;
        return Ok(synthetic(buf));
    }

    if let Some(path) = cli.file.as_ref() {
        let html =
            std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
        return Ok(synthetic(html));
    }

    let Some(first_url) = cli.urls.first() else {
        return Err("no input provided -- pass a URL, --file, or --stdin".into());
    };
    let url = normalize_url(first_url);

    let client =
        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
    client
        .fetch(&url)
        .await
        .map_err(|e| format!("fetch error: {e}"))
}
/// Fetch external stylesheets referenced in HTML and inject them as `<style>` blocks.
/// This allows brand extraction to see colors/fonts from external CSS files.
///
/// At most the first 10 `<link rel="stylesheet">` hrefs are considered, each
/// resolved against `base_url`. A stylesheet is skipped when its URL fails
/// the public-HTTP validation, the request fails or is not a success status
/// (redirects are not followed), the body looks like an HTML document, or the
/// body is 2,000,000 bytes or larger. Each request has a 5 second timeout.
///
/// The collected CSS is inserted immediately before the first `</head>`
/// (matched ASCII-case-insensitively), or prepended to the document when no
/// closing head tag exists. The input is returned unchanged when `base_url`
/// does not parse, no stylesheet links are found, or nothing could be fetched.
pub async fn enrich_html_with_stylesheets(html: &str, base_url: &str) -> String {
    let base = match url::Url::parse(base_url) {
        Ok(u) => u,
        Err(_) => return html.to_string(),
    };

    // Extract stylesheet hrefs from <link rel="stylesheet" href="...">,
    // accepting either attribute order.
    let re = regex::Regex::new(
        r#"<link[^>]+rel=["']stylesheet["'][^>]+href=["']([^"']+)["']|<link[^>]+href=["']([^"']+)["'][^>]+rel=["']stylesheet["']"#
    ).expect("stylesheet link pattern is a valid regex");

    let hrefs: Vec<String> = re
        .captures_iter(html)
        .filter_map(|cap| {
            let href = cap.get(1).or(cap.get(2))?;
            Some(
                base.join(href.as_str())
                    .map(|u| u.to_string())
                    .unwrap_or_else(|_| href.as_str().to_string()),
            )
        })
        .take(10)
        .collect();

    if hrefs.is_empty() {
        return html.to_string();
    }

    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(5))
        .redirect(reqwest::redirect::Policy::none())
        .build()
        .unwrap_or_default();

    let mut extra_css = String::new();
    for href in &hrefs {
        // hrefs come from page content, so each one is validated before any
        // request is made.
        if webclaw_fetch::url_security::validate_public_http_url(href)
            .await
            .is_err()
        {
            continue;
        }
        if let Ok(resp) = client.get(href).send().await
            && resp.status().is_success()
            && let Ok(body) = resp.text().await
            && !body.trim_start().starts_with("<!")
            && body.len() < 2_000_000
        {
            extra_css.push_str("\n<style>\n");
            extra_css.push_str(&body);
            extra_css.push_str("\n</style>\n");
        }
    }

    if extra_css.is_empty() {
        return html.to_string();
    }

    // Search an ASCII-lowercased copy: it has exactly the same byte offsets
    // as `html`. Full Unicode lowercasing can change byte lengths, which
    // would make the offset invalid for slicing the original.
    match html.to_ascii_lowercase().find("</head>") {
        Some(pos) => {
            let mut enriched = String::with_capacity(html.len() + extra_css.len());
            enriched.push_str(&html[..pos]);
            enriched.push_str(&extra_css);
            enriched.push_str(&html[pos..]);
            enriched
        }
        None => format!("{extra_css}{html}"),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::cli::OutputFormat;
    use webclaw_core::{Content, Metadata};

    /// Build an `ExtractionResult` whose markdown and plain text are both
    /// `markdown`, with the word count taken from its whitespace-separated tokens.
    fn empty_result(title: Option<&str>, url: Option<&str>, markdown: &str) -> ExtractionResult {
        let word_count = markdown.split_whitespace().count();
        let metadata = Metadata::default()
            .with_title(title.map(String::from))
            .with_url(url.map(String::from))
            .with_word_count(word_count);
        let body = markdown.to_owned();
        let content = Content::default()
            .with_markdown(body.clone())
            .with_plain_text(body);
        ExtractionResult::new(metadata, content)
    }

    /// Assert that `url` maps to `expected` for `format`; failures are
    /// reported at the calling test's line.
    #[track_caller]
    fn assert_filename(url: &str, format: &OutputFormat, expected: &str) {
        assert_eq!(url_to_filename(url, format), expected);
    }

    #[test]
    fn detect_empty_identifies_consent_redirect_url() {
        let page = empty_result(
            Some("Yahoo"),
            Some("https://guce.advertising.com/collectIdentifiers?sessionId=abc"),
            "Continue",
        );
        let reason = detect_empty(&page);
        assert_eq!(reason, EmptyReason::ConsentWall);
    }

    #[test]
    fn detect_empty_identifies_short_consent_title() {
        let page = empty_result(
            Some("Before you continue"),
            Some("https://www.google.com/"),
            "Review privacy options",
        );
        let reason = detect_empty(&page);
        assert_eq!(reason, EmptyReason::ConsentWall);
    }

    #[test]
    fn detect_empty_does_not_flag_real_content_with_consent_words() {
        let body = "This article explains cookie consent patterns for product teams with enough real body text to be useful. It covers consent banners, privacy controls, analytics configuration, regional requirements, product tradeoffs, implementation details, testing flows, debugging notes, accessibility needs, and operational lessons from real teams shipping public websites across multiple markets. It also explains measurement, rollout planning, copy review, support workflows, design constraints, release notes, and how to keep privacy choices understandable for users.";
        let page = empty_result(
            Some("Cookie consent patterns explained"),
            Some("https://example.com/blog"),
            body,
        );
        let reason = detect_empty(&page);
        assert_eq!(reason, EmptyReason::None);
    }

    #[test]
    fn url_to_filename_root() {
        // With and without the trailing slash, the site root maps to the same file.
        for root in ["https://example.com/", "https://example.com"] {
            assert_filename(root, &OutputFormat::Markdown, "example_com/index.md");
        }
    }

    #[test]
    fn url_to_filename_path() {
        assert_filename(
            "https://example.com/docs/api",
            &OutputFormat::Markdown,
            "docs/api.md",
        );
    }

    #[test]
    fn url_to_filename_trailing_slash() {
        assert_filename(
            "https://example.com/docs/api/",
            &OutputFormat::Markdown,
            "docs/api.md",
        );
    }

    #[test]
    fn url_to_filename_nested_path() {
        assert_filename(
            "https://example.com/blog/my-post",
            &OutputFormat::Markdown,
            "blog/my-post.md",
        );
    }

    #[test]
    fn url_to_filename_query_params() {
        assert_filename(
            "https://example.com/p?id=123",
            &OutputFormat::Markdown,
            "p_id_123.md",
        );
    }

    #[test]
    fn url_to_filename_json_format() {
        assert_filename(
            "https://example.com/docs/api",
            &OutputFormat::Json,
            "docs/api.json",
        );
    }

    #[test]
    fn url_to_filename_text_format() {
        assert_filename(
            "https://example.com/docs/api",
            &OutputFormat::Text,
            "docs/api.txt",
        );
    }

    #[test]
    fn url_to_filename_llm_format() {
        assert_filename(
            "https://example.com/docs/api",
            &OutputFormat::Llm,
            "docs/api.md",
        );
    }

    #[test]
    fn url_to_filename_html_format() {
        assert_filename(
            "https://example.com/docs/api",
            &OutputFormat::Html,
            "docs/api.html",
        );
    }

    #[test]
    fn url_to_filename_special_chars() {
        // Spaces and special chars get replaced with underscores
        assert_filename(
            "https://example.com/path%20with%20spaces",
            &OutputFormat::Markdown,
            "path_20with_20spaces.md",
        );
    }

    #[test]
    fn write_to_file_creates_dirs() {
        let root = std::env::temp_dir().join("webclaw_test_output_dir");
        // Start from a clean slate; a missing directory is fine.
        let _ = std::fs::remove_dir_all(&root);

        write_to_file(&root, "nested/deep/file.md", "hello").unwrap();

        let written = root.join("nested/deep/file.md");
        let content = std::fs::read_to_string(written).unwrap();
        assert_eq!(content, "hello");

        let _ = std::fs::remove_dir_all(&root);
    }

    #[test]
    fn url_to_filename_strips_traversal_segments() {
        // `..` / `.` / empty path segments must not survive into the path.
        let escaped = url_to_filename(
            "https://example.com/../../etc/passwd",
            &OutputFormat::Markdown,
        );
        assert!(!escaped.contains(".."), "traversal leaked: {escaped}");
        assert_eq!(escaped, "etc/passwd.md");

        let collapsed = url_to_filename("https://example.com/a/./b//c", &OutputFormat::Json);
        assert_eq!(collapsed, "a/b/c.json");
    }

    #[test]
    fn safe_relative_filename_rejects_escapes() {
        for rejected in ["../escape.md", "a/../../b.md", "/etc/passwd", ""] {
            assert!(safe_relative_filename(rejected).is_err());
        }
        // Normal nested relative names stay allowed.
        for accepted in ["nested/deep/file.md", "./ok.md"] {
            assert!(safe_relative_filename(accepted).is_ok());
        }
    }

    #[test]
    fn write_to_file_refuses_traversal_filename() {
        let root = std::env::temp_dir().join("webclaw_test_traversal_dir");
        let _ = std::fs::remove_dir_all(&root);

        // CSV-supplied `url,filename` traversal attempt.
        let err = write_to_file(&root, "../../tmp/webclaw_pwned.md", "x").unwrap_err();
        assert!(err.contains("refusing"), "unexpected error: {err}");

        let escaped_target = std::path::Path::new("/tmp/webclaw_pwned.md");
        assert!(
            !escaped_target.exists(),
            "traversal write escaped the output dir"
        );

        let _ = std::fs::remove_dir_all(&root);
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,376 @@
//! Output formatting and rendering for every CLI mode.
//!
//! `render_one` is the single source of truth for turning one
//! `ExtractionResult` into a standalone document for a given format. The
//! `print_*`/`format_*` functions own iteration and separator logic and
//! delegate the per-page body to `render_one`.
use webclaw_core::{ContentDiff, ExtractionResult, Metadata, to_llm_text};
use webclaw_fetch::{BatchExtractResult, CrawlResult, PageResult, SitemapEntry};
use crate::cli::OutputFormat;
/// Return the raw HTML captured for `result`, or its markdown when no raw
/// HTML was stored.
pub fn raw_html_or_markdown(result: &ExtractionResult) -> &str {
    if let Some(html) = result.content.raw_html.as_deref() {
        return html;
    }
    &result.content.markdown
}
/// Build a YAML front-matter block (`---` … `---` plus a trailing blank line)
/// from `meta`.
///
/// Emits `title`, `author`, `date` and `source` only when the corresponding
/// field is set, and `word_count` only when it is greater than zero. String
/// values are written as YAML double-quoted scalars with `\`, `"`, line feeds
/// and carriage returns escaped, so a title such as `He said "hi"` still
/// yields parseable front matter.
pub fn format_frontmatter(meta: &Metadata) -> String {
    /// Render `value` as a YAML double-quoted scalar, escaping the characters
    /// that would otherwise end the scalar early or break it across lines.
    fn yaml_quoted(value: impl std::fmt::Display) -> String {
        let raw = value.to_string();
        let mut quoted = String::with_capacity(raw.len() + 2);
        quoted.push('"');
        for ch in raw.chars() {
            match ch {
                '\\' => quoted.push_str("\\\\"),
                '"' => quoted.push_str("\\\""),
                '\n' => quoted.push_str("\\n"),
                '\r' => quoted.push_str("\\r"),
                other => quoted.push(other),
            }
        }
        quoted.push('"');
        quoted
    }

    let mut lines = vec!["---".to_string()];
    if let Some(title) = &meta.title {
        lines.push(format!("title: {}", yaml_quoted(title)));
    }
    if let Some(author) = &meta.author {
        lines.push(format!("author: {}", yaml_quoted(author)));
    }
    if let Some(date) = &meta.published_date {
        lines.push(format!("date: {}", yaml_quoted(date)));
    }
    if let Some(url) = &meta.url {
        lines.push(format!("source: {}", yaml_quoted(url)));
    }
    if meta.word_count > 0 {
        lines.push(format!("word_count: {}", meta.word_count));
    }
    lines.push("---".to_string());
    lines.push(String::new()); // blank line after frontmatter
    lines.join("\n")
}
/// Turn one `ExtractionResult` into a complete document string for `format`.
///
/// `show_metadata` only affects Markdown, where it prepends the front-matter
/// block. The Llm format takes its source URL from `metadata.url`.
///
/// `format_output` and `print_output` both delegate here; iteration and
/// separators between pages are the caller's responsibility.
pub fn render_one(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
    match format {
        OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
        OutputFormat::Text => result.content.plain_text.clone(),
        OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
        OutputFormat::Html => raw_html_or_markdown(result).to_string(),
        OutputFormat::Markdown => {
            let mut doc = if show_metadata {
                format_frontmatter(&result.metadata)
            } else {
                String::new()
            };
            doc.push_str(&result.content.markdown);
            if !result.structured_data.is_empty() {
                // Append structured data as a fenced JSON section; if it
                // cannot be serialized the fence is left empty.
                let structured =
                    serde_json::to_string_pretty(&result.structured_data).unwrap_or_default();
                doc.push_str("\n\n## Structured Data\n\n```json\n");
                doc.push_str(&structured);
                doc.push_str("\n```");
            }
            doc
        }
    }
}
/// Format an `ExtractionResult` into a string for the given output format.
///
/// Thin wrapper that forwards all three arguments to `render_one` and returns
/// its result unchanged.
pub fn format_output(
result: &ExtractionResult,
format: &OutputFormat,
show_metadata: bool,
) -> String {
render_one(result, format, show_metadata)
}
/// Render `result` with `render_one` and write it to stdout followed by a newline.
pub fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) {
    let rendered = render_one(result, format, show_metadata);
    println!("{rendered}");
}
/// Print a cloud API response to stdout in the requested format.
///
/// Lookup order per format:
/// - Json: the whole response, pretty-printed.
/// - Markdown: `content.markdown`, then top-level `markdown`.
/// - Text / Llm / Html: `content.plain_text` / `content.llm_text` /
///   `content.raw_html`, then the Markdown lookup above.
///
/// When none of the candidate fields holds a string, the whole response is
/// pretty-printed as JSON instead.
pub fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
    /// String stored at `content.<key>`, if any.
    fn content_str<'a>(resp: &'a serde_json::Value, key: &str) -> Option<&'a str> {
        resp.get("content")?.get(key)?.as_str()
    }

    /// Markdown body: `content.markdown` first, then top-level `markdown`.
    fn markdown_str(resp: &serde_json::Value) -> Option<&str> {
        content_str(resp, "markdown").or_else(|| resp.get("markdown")?.as_str())
    }

    let text = match format {
        OutputFormat::Json => None,
        OutputFormat::Markdown => markdown_str(resp),
        OutputFormat::Text => content_str(resp, "plain_text").or_else(|| markdown_str(resp)),
        OutputFormat::Llm => content_str(resp, "llm_text").or_else(|| markdown_str(resp)),
        OutputFormat::Html => content_str(resp, "raw_html").or_else(|| markdown_str(resp)),
    };

    match text {
        Some(text) => println!("{text}"),
        // Nothing usable for this format: dump the raw response.
        None => println!(
            "{}",
            serde_json::to_string_pretty(resp).expect("serialization failed")
        ),
    }
}
/// Print a `ContentDiff` to stdout.
///
/// Json pretty-prints the whole diff. Every other format gets the same
/// human-readable summary: status, word-count delta, then the metadata
/// changes, added links, removed links and text diff that are present.
pub fn print_diff_output(diff: &ContentDiff, format: &OutputFormat) {
    if matches!(format, OutputFormat::Json) {
        let json = serde_json::to_string_pretty(diff).expect("serialization failed");
        println!("{json}");
        return;
    }

    println!("Status: {:?}", diff.status);
    println!("Word count delta: {:+}", diff.word_count_delta);

    if !diff.metadata_changes.is_empty() {
        println!("\nMetadata changes:");
        for entry in &diff.metadata_changes {
            let before = entry.old.as_deref().unwrap_or("(none)");
            let after = entry.new.as_deref().unwrap_or("(none)");
            println!(" {}: {} -> {}", entry.field, before, after);
        }
    }

    if !diff.links_added.is_empty() {
        println!("\nLinks added:");
        for added in &diff.links_added {
            println!(" + {} ({})", added.href, added.text);
        }
    }

    if !diff.links_removed.is_empty() {
        println!("\nLinks removed:");
        for removed in &diff.links_removed {
            println!(" - {} ({})", removed.href, removed.text);
        }
    }

    if let Some(text_diff) = diff.text_diff.as_ref() {
        println!("\n{text_diff}");
    }
}
/// Print a crawl result to stdout.
///
/// Json pretty-prints the whole `CrawlResult`. The other formats print one
/// section per page that has an extraction (pages without one are skipped):
/// a `---` rule, a per-format header and body, then a blank line.
/// `show_metadata` adds front matter in Markdown only.
pub fn print_crawl_output(result: &CrawlResult, format: &OutputFormat, show_metadata: bool) {
    // Pages paired with their extraction; pages that produced none are dropped.
    let extracted_pages = || {
        result
            .pages
            .iter()
            .filter_map(|page| page.extraction.as_ref().map(|extraction| (page, extraction)))
    };

    match format {
        OutputFormat::Json => {
            let json = serde_json::to_string_pretty(result).expect("serialization failed");
            println!("{json}");
        }
        OutputFormat::Markdown => {
            for (page, extraction) in extracted_pages() {
                println!("---");
                println!("# Page: {}\n", page.url);
                if show_metadata {
                    print!("{}", format_frontmatter(&extraction.metadata));
                }
                println!("{}", extraction.content.markdown);
                println!();
            }
        }
        OutputFormat::Text => {
            for (page, extraction) in extracted_pages() {
                println!("---");
                println!("# Page: {}\n", page.url);
                println!("{}", extraction.content.plain_text);
                println!();
            }
        }
        OutputFormat::Llm => {
            for (page, extraction) in extracted_pages() {
                println!("---");
                println!("{}", to_llm_text(extraction, Some(page.url.as_str())));
                println!();
            }
        }
        OutputFormat::Html => {
            for (page, extraction) in extracted_pages() {
                println!("---");
                println!("<!-- Page: {} -->\n", page.url);
                println!("{}", raw_html_or_markdown(extraction));
                println!();
            }
        }
    }
}
pub fn print_batch_output(
results: &[BatchExtractResult],
format: &OutputFormat,
show_metadata: bool,
) {
match format {
OutputFormat::Json => {
// Build a JSON array of {url, result?, error?} objects
let entries: Vec<serde_json::Value> = results
.iter()
.map(|r| match &r.result {
Ok(extraction) => serde_json::json!({
"url": r.url,
"result": extraction,
}),
Err(e) => serde_json::json!({
"url": r.url,
"error": e.to_string(),
}),
})
.collect();
println!(
"{}",
serde_json::to_string_pretty(&entries).expect("serialization failed")
);
}
OutputFormat::Markdown => {
for r in results {
match &r.result {
Ok(extraction) => {
println!("---");
println!("# {}\n", r.url);
if show_metadata {
print!("{}", format_frontmatter(&extraction.metadata));
}
println!("{}", extraction.content.markdown);
println!();
}
Err(e) => {
eprintln!("error: {} -- {}", r.url, e);
}
}
}
}
OutputFormat::Text => {
for r in results {
match &r.result {
Ok(extraction) => {
println!("---");
println!("# {}\n", r.url);
println!("{}", extraction.content.plain_text);
println!();
}
Err(e) => {
eprintln!("error: {} -- {}", r.url, e);
}
}
}
}
OutputFormat::Llm => {
for r in results {
match &r.result {
Ok(extraction) => {
println!("---");
println!("{}", to_llm_text(extraction, Some(r.url.as_str())));
println!();
}
Err(e) => {
eprintln!("error: {} -- {}", r.url, e);
}
}
}
}
OutputFormat::Html => {
for r in results {
match &r.result {
Ok(extraction) => {
println!("---");
println!("<!-- {} -->\n", r.url);
println!("{}", raw_html_or_markdown(extraction));
println!();
}
Err(e) => {
eprintln!("error: {} -- {}", r.url, e);
}
}
}
}
}
}
/// Print sitemap entries to stdout: pretty-printed JSON for the Json format,
/// otherwise one URL per line.
pub fn print_map_output(entries: &[SitemapEntry], format: &OutputFormat) {
    if matches!(format, OutputFormat::Json) {
        let json = serde_json::to_string_pretty(entries).expect("serialization failed");
        println!("{json}");
        return;
    }
    entries.iter().for_each(|entry| println!("{}", entry.url));
}
/// Format a streaming progress line for a completed page.
pub fn format_progress(page: &PageResult, index: usize, max_pages: usize) -> String {
let status = if page.error.is_some() { "ERR" } else { "OK " };
let timing = format!("{}ms", page.elapsed.as_millis());
let detail = if let Some(ref extraction) = page.extraction {
format!(", {} words", extraction.metadata.word_count)
} else if let Some(ref err) = page.error {
format!(" ({err})")
} else {
String::new()
};
format!(
"[{index}/{max_pages}] {status} {} ({timing}{detail})",
page.url
)
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,121 @@
//! Webhook delivery and `--on-change` command execution.
/// Spawn the `--on-change` command with `payload` on stdin.
///
/// Previously this passed the entire user-provided string to `sh -c`, which
/// made `--on-change 'notify "$URL"; rm -rf /'` a plausible disaster the
/// moment an untrusted config file or MCP-driven agent fed us a command.
/// The MCP surface specifically is prompt-injection-exposed: an LLM that
/// controls CLI args can escalate into arbitrary shell on the host.
///
/// We now parse the command with `shlex` (POSIX-ish tokenization with proper
/// quoting) and exec the program directly without an intermediate shell, so
/// metacharacters like `;`, `&&`, `|`, `$()`, and env expansion can't fire.
/// Users who genuinely need a pipeline can set the whole chain behind a
/// script they've written, or opt in per-call via `WEBCLAW_ALLOW_SHELL=1`
/// (`true`, in any ASCII case, is accepted as well; documented escape hatch,
/// noisy by design).
///
/// The function returns once the payload has been written to the child's
/// stdin; it does not wait for the command to exit. Parse errors, spawn
/// failures and payload write failures are reported on stderr and never
/// propagated to the caller.
pub async fn spawn_on_change(cmd: &str, stdin_payload: &[u8]) {
    eprintln!("[watch] Running: {cmd}");
    let allow_shell = std::env::var("WEBCLAW_ALLOW_SHELL")
        .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
        .unwrap_or(false);
    let mut command = if allow_shell {
        eprintln!("[watch] WEBCLAW_ALLOW_SHELL=1 — executing via sh -c (unsafe)");
        let mut c = tokio::process::Command::new("sh");
        c.arg("-c").arg(cmd);
        c
    } else {
        let Some(argv) = shlex::split(cmd) else {
            eprintln!("[watch] Failed to parse --on-change command (unbalanced quotes?)");
            return;
        };
        let Some((program, args)) = argv.split_first() else {
            eprintln!("[watch] --on-change command is empty");
            return;
        };
        let mut c = tokio::process::Command::new(program);
        c.args(args);
        c
    };
    command.stdin(std::process::Stdio::piped());
    let mut child = match command.spawn() {
        Ok(child) => child,
        Err(e) => {
            eprintln!("[watch] Failed to run command: {e}");
            return;
        }
    };
    if let Some(mut stdin) = child.stdin.take() {
        use tokio::io::AsyncWriteExt;
        match stdin.write_all(stdin_payload).await {
            Ok(()) => {}
            // The command exited or closed stdin without reading the payload.
            // Commands that ignore stdin are a normal use, so stay quiet.
            Err(e) if e.kind() == std::io::ErrorKind::BrokenPipe => {}
            // Anything else means the payload was not delivered; say so
            // instead of dropping the error on the floor.
            Err(e) => eprintln!("[watch] Failed to write payload to command stdin: {e}"),
        }
        // `stdin` is dropped here, closing the pipe so the child sees EOF.
    }
}
/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr.
/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly.
///
/// Detection is by substring: URLs containing `discord.com/api/webhooks` get
/// the payload wrapped in a Discord embed, URLs containing `hooks.slack.com`
/// get a Slack `text` message, and everything else receives the payload as-is.
///
/// The request runs on a task created with `tokio::spawn`, so this must be
/// called from within a Tokio runtime. Before sending, the URL is checked
/// with `validate_public_http_url`. The client uses a 10-second timeout and
/// does not follow redirects; a redirecting endpoint is logged with its 3xx
/// status instead of being followed.
pub fn fire_webhook(url: &str, payload: &serde_json::Value) {
    let url = url.to_string();
    let is_discord = url.contains("discord.com/api/webhooks");
    let is_slack = url.contains("hooks.slack.com");
    let body = if is_discord {
        let event = payload
            .get("event")
            .and_then(|v| v.as_str())
            .unwrap_or("notification");
        let details = serde_json::to_string_pretty(payload).unwrap_or_default();
        serde_json::json!({
            "embeds": [{
                "title": format!("webclaw: {event}"),
                "description": format!("```json\n{details}\n```"),
                "color": 5814783
            }]
        })
        .to_string()
    } else if is_slack {
        let event = payload
            .get("event")
            .and_then(|v| v.as_str())
            .unwrap_or("notification");
        let details = serde_json::to_string_pretty(payload).unwrap_or_default();
        serde_json::json!({
            "text": format!("*webclaw: {event}*\n```{details}```")
        })
        .to_string()
    } else {
        serde_json::to_string(payload).unwrap_or_default()
    };
    tokio::spawn(async move {
        // SSRF guard: a webhook URL is user-supplied and otherwise bypasses
        // the fetch-layer protections, so resolve + reject private/internal
        // destinations before sending the payload.
        if let Err(e) = webclaw_fetch::url_security::validate_public_http_url(&url).await {
            eprintln!("[webhook] refusing unsafe URL: {e}");
            return;
        }
        let client = match reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(10))
            // Only the original URL was validated above. Following redirects
            // would let a public endpoint bounce the POST (payload included)
            // to an internal address, defeating the guard.
            .redirect(reqwest::redirect::Policy::none())
            .build()
        {
            Ok(client) => client,
            Err(e) => {
                eprintln!("[webhook] client error: {e}");
                return;
            }
        };
        match client
            .post(&url)
            .header("Content-Type", "application/json")
            .body(body)
            .send()
            .await
        {
            Ok(resp) => {
                // Log at most 60 bytes of the URL, backing up to a char
                // boundary: slicing a `str` inside a multi-byte character
                // panics, and the URL may contain non-ASCII text.
                let mut end = url.len().min(60);
                while !url.is_char_boundary(end) {
                    end -= 1;
                }
                eprintln!("[webhook] POST {} -> {}", &url[..end], resp.status());
            }
            Err(e) => eprintln!("[webhook] POST failed: {e}"),
        }
    });
}