diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bf03cee..4d5625e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,11 @@ env: jobs: test: name: Test - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] steps: - uses: actions/checkout@v5 - uses: dtolnay/rust-toolchain@stable @@ -29,7 +33,16 @@ jobs: components: clippy, rustfmt - uses: Swatinem/rust-cache@v2 - run: cargo fmt --check --all - - run: cargo clippy --all -- -D warnings + - run: cargo clippy --all --all-targets -- -D warnings + + deny: + name: Supply chain + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + - uses: EmbarkStudios/cargo-deny-action@v2 + with: + command: check advisories bans licenses sources wasm: name: WASM diff --git a/Cargo.lock b/Cargo.lock index 78e7e77..942d841 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3331,11 +3331,13 @@ dependencies = [ "anyhow", "axum", "clap", + "http-body-util", "serde", "serde_json", "subtle", "thiserror", "tokio", + "tower", "tower-http", "tracing", "tracing-subscriber", diff --git a/Cargo.toml b/Cargo.toml index 124c620..1a80438 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,9 +5,31 @@ members = ["crates/*"] [workspace.package] version = "0.6.5" edition = "2024" +rust-version = "1.85" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" +# Hardened release profile: thin LTO + a single codegen unit enable +# cross-crate inlining on the hot extraction path and shrink the binaries, +# and stripping symbols trims the shipped artifact. We deliberately do NOT +# set `panic = "abort"`: webclaw-pdf relies on std::panic::catch_unwind to +# recover from panics inside the pdf-extract parser, and abort would turn +# those recoverable panics into hard process kills. +[profile.release] +lto = "thin" +codegen-units = 1 +strip = true + +# Conservative, high-value hardening lints applied workspace-wide. 
Crates +# opt in via `[lints] workspace = true`. Kept deliberately narrow so +# `clippy -D warnings` stays green — the goal is hardening, not a cleanup +# sweep that would break the build. +[workspace.lints.rust] +unsafe_op_in_unsafe_fn = "warn" + +[workspace.lints.clippy] +mem_forget = "warn" + [workspace.dependencies] webclaw-core = { path = "crates/webclaw-core" } webclaw-fetch = { path = "crates/webclaw-fetch" } diff --git a/crates/webclaw-cli/Cargo.toml b/crates/webclaw-cli/Cargo.toml index adce50f..a073ce2 100644 --- a/crates/webclaw-cli/Cargo.toml +++ b/crates/webclaw-cli/Cargo.toml @@ -3,8 +3,12 @@ name = "webclaw-cli" description = "CLI for extracting web content into LLM-optimized formats" version.workspace = true edition.workspace = true +rust-version.workspace = true license.workspace = true +[lints] +workspace = true + [[bin]] name = "webclaw" path = "src/main.rs" diff --git a/crates/webclaw-cli/examples/perf_corpus.rs b/crates/webclaw-cli/examples/perf_corpus.rs index aaa4c02..1198bf3 100644 --- a/crates/webclaw-cli/examples/perf_corpus.rs +++ b/crates/webclaw-cli/examples/perf_corpus.rs @@ -48,7 +48,10 @@ async fn main() { match mode.as_str() { "capture" => capture().await, "bench" => { - let iters: usize = std::env::args().nth(2).and_then(|s| s.parse().ok()).unwrap_or(60); + let iters: usize = std::env::args() + .nth(2) + .and_then(|s| s.parse().ok()) + .unwrap_or(60); bench(iters); } "snapshot" => { @@ -64,14 +67,21 @@ async fn main() { async fn capture() { fs::create_dir_all(CORPUS).unwrap(); - let config = FetchConfig { browser: BrowserProfile::Chrome, ..FetchConfig::default() }; + let config = FetchConfig { + browser: BrowserProfile::Chrome, + ..FetchConfig::default() + }; let client = FetchClient::new(config).expect("build client"); let mut ok = 0; for (i, u) in URLS.iter().enumerate() { let name = format!( "{:02}_{}.html", i + 1, - u.replace("https://", "").chars().map(|c| if c.is_alphanumeric() { c } else { '_' }).take(40).collect::() + 
u.replace("https://", "") + .chars() + .map(|c| if c.is_alphanumeric() { c } else { '_' }) + .take(40) + .collect::() ); match client.fetch(u).await { Ok(f) if f.html.len() > 1000 => { @@ -99,7 +109,9 @@ fn snapshot(label: &str) { let mut n = 0; for path in &files { let html = fs::read_to_string(path).unwrap_or_default(); - if html.is_empty() { continue; } + if html.is_empty() { + continue; + } let stem = path.file_stem().unwrap().to_string_lossy().to_string(); let url = format!("https://corpus/{stem}"); match extract(&html, Some(&url)) { @@ -117,7 +129,9 @@ fn snapshot(label: &str) { } fn percentile(sorted: &[u128], p: f64) -> u128 { - if sorted.is_empty() { return 0; } + if sorted.is_empty() { + return 0; + } let idx = ((sorted.len() as f64 - 1.0) * p).round() as usize; sorted[idx] } @@ -135,7 +149,10 @@ fn bench(iters: usize) { } println!("# perf_corpus bench docs={} iters={}", files.len(), iters); - println!("{:<42} {:>10} {:>10} {:>10} {:>10}", "doc(KB)", "extract_us", "llm_us", "p50_us", "p90_us"); + println!( + "{:<42} {:>10} {:>10} {:>10} {:>10}", + "doc(KB)", "extract_us", "llm_us", "p50_us", "p90_us" + ); let mut grand_extract = 0u128; let mut grand_llm = 0u128; @@ -143,8 +160,13 @@ fn bench(iters: usize) { for path in &files { let html = fs::read_to_string(path).unwrap_or_default(); - if html.is_empty() { continue; } - let url = format!("https://corpus/{}", path.file_name().unwrap().to_string_lossy()); + if html.is_empty() { + continue; + } + let url = format!( + "https://corpus/{}", + path.file_name().unwrap().to_string_lossy() + ); // warmup for _ in 0..5 { @@ -158,7 +180,10 @@ fn bench(iters: usize) { let mut total_times = Vec::with_capacity(iters); for _ in 0..iters { let t0 = Instant::now(); - let ex = match extract(&html, Some(&url)) { Ok(e) => e, Err(_) => continue }; + let ex = match extract(&html, Some(&url)) { + Ok(e) => e, + Err(_) => continue, + }; let t1 = Instant::now(); let txt = to_llm_text(&ex, Some(&url)); let t2 = Instant::now(); @@ 
-178,11 +203,24 @@ fn bench(iters: usize) { grand_llm += llm_p50; grand_total_p50 += tot_p50; - let label = format!("{} ({}KB)", path.file_stem().unwrap().to_string_lossy(), html.len() / 1024); - println!("{:<42} {:>10} {:>10} {:>10} {:>10}", label.chars().take(42).collect::(), ex_p50, llm_p50, tot_p50, tot_p90); + let label = format!( + "{} ({}KB)", + path.file_stem().unwrap().to_string_lossy(), + html.len() / 1024 + ); + println!( + "{:<42} {:>10} {:>10} {:>10} {:>10}", + label.chars().take(42).collect::(), + ex_p50, + llm_p50, + tot_p50, + tot_p90 + ); } println!("---"); - println!("CORPUS_PASS_P50_SUM_US extract={grand_extract} llm={grand_llm} total={grand_total_p50}"); + println!( + "CORPUS_PASS_P50_SUM_US extract={grand_extract} llm={grand_llm} total={grand_total_p50}" + ); println!("(lower is better; total = one full extract+llm pass over the whole corpus at p50)"); } diff --git a/crates/webclaw-cli/src/bench.rs b/crates/webclaw-cli/src/bench.rs index 3e45da4..4c9dbb1 100644 --- a/crates/webclaw-cli/src/bench.rs +++ b/crates/webclaw-cli/src/bench.rs @@ -198,7 +198,7 @@ fn fmt_int(n: usize) -> String { let bytes = s.as_bytes(); let mut out = String::with_capacity(s.len() + s.len() / 3); for (i, b) in bytes.iter().enumerate() { - if i > 0 && (bytes.len() - i).is_multiple_of(3) { + if i > 0 && (bytes.len() - i) % 3 == 0 { out.push(','); } out.push(*b as char); diff --git a/crates/webclaw-cli/src/cli.rs b/crates/webclaw-cli/src/cli.rs new file mode 100644 index 0000000..403e8cf --- /dev/null +++ b/crates/webclaw-cli/src/cli.rs @@ -0,0 +1,324 @@ +//! CLI argument definitions: clap structs/enums and their conversions. + +use std::path::PathBuf; + +use clap::{Parser, Subcommand, ValueEnum}; +use webclaw_fetch::BrowserProfile; +use webclaw_pdf::PdfMode; + +#[derive(Parser)] +#[command(name = "webclaw", about = "Extract web content for LLMs", version)] +pub struct Cli { + /// Optional subcommand. 
When omitted, the CLI falls back to the + /// traditional flag-based flow (URL + --format, --crawl, etc.). + /// Subcommands are used for flows that don't fit that model. + #[command(subcommand)] + pub command: Option, + + /// URLs to fetch (multiple allowed) + #[arg()] + pub urls: Vec, + + /// File with URLs (one per line) + #[arg(long)] + pub urls_file: Option, + + /// Output format (markdown, json, text, llm, html) + #[arg(short, long, default_value = "markdown")] + pub format: OutputFormat, + + /// Browser to impersonate + #[arg(short, long, default_value = "chrome")] + pub browser: Browser, + + /// Proxy URL (http://user:pass@host:port or socks5://host:port) + #[arg(short, long, env = "WEBCLAW_PROXY")] + pub proxy: Option, + + /// File with proxies (host:port:user:pass, one per line). Rotates per request. + #[arg(long, env = "WEBCLAW_PROXY_FILE")] + pub proxy_file: Option, + + /// Request timeout in seconds + #[arg(short, long, default_value = "30")] + pub timeout: u64, + + /// Extract from local HTML file instead of fetching + #[arg(long)] + pub file: Option, + + /// Read HTML from stdin + #[arg(long)] + pub stdin: bool, + + /// Include metadata in output (always included in JSON) + #[arg(long)] + pub metadata: bool, + + /// Output raw fetched HTML instead of extracting + #[arg(long)] + pub raw_html: bool, + + /// CSS selectors to include (comma-separated, e.g. "article,.content") + #[arg(long)] + pub include: Option, + + /// CSS selectors to exclude (comma-separated, e.g. "nav,.sidebar,footer") + #[arg(long)] + pub exclude: Option, + + /// Only extract main content (article/main element) + #[arg(long)] + pub only_main_content: bool, + + /// Custom headers (repeatable, e.g. 
-H "Cookie: foo=bar") + #[arg(short = 'H', long = "header")] + pub headers: Vec, + + /// Cookie string (shorthand for -H "Cookie: ...") + #[arg(long)] + pub cookie: Option, + + /// JSON cookie file (Chrome extension format: [{name, value, domain, ...}]) + #[arg(long)] + pub cookie_file: Option, + + /// Enable verbose logging + #[arg(short, long)] + pub verbose: bool, + + /// Compare against a previous JSON snapshot + #[arg(long)] + pub diff_with: Option, + + /// Watch a URL for changes. Checks at the specified interval and reports diffs. + #[arg(long)] + pub watch: bool, + + /// Watch interval in seconds [default: 300] + #[arg(long, default_value = "300")] + pub watch_interval: u64, + + /// Command to run when changes are detected (receives diff JSON on stdin) + #[arg(long)] + pub on_change: Option, + + /// Webhook URL: POST a JSON payload when an operation completes. + /// Works with crawl, batch, watch (on change), and single URL modes. + #[arg(long, env = "WEBCLAW_WEBHOOK_URL")] + pub webhook: Option, + + /// Extract brand identity (colors, fonts, logo) + #[arg(long)] + pub brand: bool, + + // -- PDF options -- + /// PDF extraction mode: auto (error on empty) or fast (return whatever text is found) + #[arg(long, default_value = "auto")] + pub pdf_mode: PdfModeArg, + + // -- Crawl options -- + /// Enable recursive crawling of same-domain links + #[arg(long)] + pub crawl: bool, + + /// Max crawl depth [default: 1] + #[arg(long, default_value = "1")] + pub depth: usize, + + /// Max pages to crawl [default: 20] + #[arg(long, default_value = "20")] + pub max_pages: usize, + + /// Max concurrent requests [default: 5] + #[arg(long, default_value = "5")] + pub concurrency: usize, + + /// Delay between requests in ms [default: 100] + #[arg(long, default_value = "100")] + pub delay: u64, + + /// Only crawl URLs matching this path prefix + #[arg(long)] + pub path_prefix: Option, + + /// Glob patterns for crawl URL paths to include (comma-separated, e.g. 
"/api/*,/guides/**") + #[arg(long)] + pub include_paths: Option, + + /// Glob patterns for crawl URL paths to exclude (comma-separated, e.g. "/changelog/*,/blog/*") + #[arg(long)] + pub exclude_paths: Option, + + /// Path to save/resume crawl state. On Ctrl+C: saves progress. On start: resumes if file exists. + #[arg(long)] + pub crawl_state: Option, + + /// Seed crawl frontier from sitemap discovery (robots.txt + /sitemap.xml) + #[arg(long)] + pub sitemap: bool, + + /// Discover URLs from sitemap and print them (one per line; JSON array with --format json) + #[arg(long)] + pub map: bool, + + // -- LLM options -- + /// Extract structured JSON using LLM (pass a JSON schema string or @file) + #[arg(long)] + pub extract_json: Option, + + /// Extract using natural language prompt + #[arg(long)] + pub extract_prompt: Option, + + /// Summarize content using LLM (optional: number of sentences, default 3) + #[arg(long, num_args = 0..=1, default_missing_value = "3")] + pub summarize: Option, + + /// Force a specific LLM provider (ollama, openai, anthropic) + #[arg(long, env = "WEBCLAW_LLM_PROVIDER")] + pub llm_provider: Option, + + /// Override the LLM model name + #[arg(long, env = "WEBCLAW_LLM_MODEL")] + pub llm_model: Option, + + /// Override the LLM base URL (Ollama, OpenAI-compatible, or Anthropic-compatible) + #[arg(long, env = "WEBCLAW_LLM_BASE_URL")] + pub llm_base_url: Option, + + // -- Cloud API options -- + /// Webclaw Cloud API key for automatic fallback on bot-protected or JS-rendered sites + #[arg(long, env = "WEBCLAW_API_KEY")] + pub api_key: Option, + + /// Force all requests through the cloud API (skip local extraction) + #[arg(long)] + pub cloud: bool, + + /// Run deep research on a topic via the cloud API. Requires --api-key. + /// Saves full result (report + sources + findings) to a JSON file. + #[arg(long)] + pub research: Option, + + /// Enable deep research mode (longer, more thorough report). Used with --research. 
+ #[arg(long)] + pub deep: bool, + + /// Output directory: save each page to a separate file instead of stdout. + /// Works with --crawl, batch (multiple URLs), and single URL mode. + /// Filenames are derived from URL paths (e.g. /docs/api -> docs/api.md). + #[arg(long)] + pub output_dir: Option, +} + +#[derive(Subcommand)] +pub enum Commands { + /// Per-URL extraction micro-benchmark: compares raw HTML vs. the + /// webclaw --format llm output on token count, bytes, and + /// extraction time. Uses an approximate tokenizer (see `--help`). + Bench { + /// URL to benchmark. + url: String, + + /// Emit a single JSON line instead of the ASCII table. + /// Machine-readable shape stable across releases. + #[arg(long)] + json: bool, + + /// Optional path to a facts.json (same schema as the repo's + /// benchmarks/facts.json) for a fidelity column. + #[arg(long)] + facts: Option, + }, + + /// List all vertical extractors in the catalog. + /// + /// Each entry has a stable `name` (usable with `webclaw vertical `), + /// a human-friendly label, a one-line description, and the URL + /// patterns it claims. The same data is served by `/v1/extractors` + /// when running the REST API. + Extractors { + /// Emit JSON instead of a human-friendly table. + #[arg(long)] + json: bool, + }, + + /// Run a vertical extractor by name. Returns typed JSON with fields + /// specific to the target site (title, price, author, rating, etc.) + /// rather than generic markdown. + /// + /// Use `webclaw extractors` to see the full list. Example: + /// `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/`. + Vertical { + /// Vertical name (e.g. `reddit`, `github_repo`, `trustpilot_reviews`). + name: String, + /// URL to extract. + url: String, + /// Emit compact JSON (single line). Default is pretty-printed. 
+ #[arg(long)] + raw: bool, + }, +} + +#[derive(Clone, ValueEnum)] +pub enum OutputFormat { + Markdown, + Json, + Text, + Llm, + Html, +} + +impl OutputFormat { + /// Map to the cloud API's `formats` string. Single source of truth for the + /// format names the REST API expects. + pub fn as_api_str(&self) -> &'static str { + match self { + OutputFormat::Markdown => "markdown", + OutputFormat::Json => "json", + OutputFormat::Text => "text", + OutputFormat::Llm => "llm", + OutputFormat::Html => "html", + } + } +} + +#[derive(Clone, ValueEnum)] +pub enum Browser { + Chrome, + Firefox, + /// Safari iOS 26. Pair with a country-matched residential proxy for sites + /// that reject non-mobile profiles. + SafariIos, + Random, +} + +#[derive(Clone, ValueEnum, Default)] +pub enum PdfModeArg { + /// Error if PDF has no extractable text (catches scanned PDFs) + #[default] + Auto, + /// Return whatever text is found, even if empty + Fast, +} + +impl From for PdfMode { + fn from(arg: PdfModeArg) -> Self { + match arg { + PdfModeArg::Auto => PdfMode::Auto, + PdfModeArg::Fast => PdfMode::Fast, + } + } +} + +impl From for BrowserProfile { + fn from(b: Browser) -> Self { + match b { + Browser::Chrome => BrowserProfile::Chrome, + Browser::Firefox => BrowserProfile::Firefox, + Browser::SafariIos => BrowserProfile::SafariIos, + Browser::Random => BrowserProfile::Random, + } + } +} diff --git a/crates/webclaw-cli/src/fetch.rs b/crates/webclaw-cli/src/fetch.rs new file mode 100644 index 0000000..df7b006 --- /dev/null +++ b/crates/webclaw-cli/src/fetch.rs @@ -0,0 +1,823 @@ +//! Input handling and fetching: config building, URL/cookie parsing, empty-page +//! detection, output-file writing, and the fetch+extract entry points (local, +//! remote, and cloud fallback). 
+ +use std::io::{self, Read as _}; +use std::path::{Path, PathBuf}; +use std::process; + +use webclaw_core::{ExtractionOptions, ExtractionResult, extract_with_options}; +use webclaw_fetch::{FetchClient, FetchConfig, FetchResult}; + +use crate::cli::Cli; + +/// Known anti-bot challenge page titles (case-insensitive prefix match). +const ANTIBOT_TITLES: &[&str] = &[ + "just a moment", + "attention required", + "access denied", + "checking your browser", + "please wait", + "one more step", + "verify you are human", + "bot verification", + "security check", + "ddos protection", +]; + +/// URL host/path fragments that indicate a GDPR/cookie consent redirect. +const CONSENT_URL_FRAGMENTS: &[&str] = &[ + "://consent.", + "/consent?", + "/consent/", + "collectconsent", + "consentcheck", + "/cmp/", + "guce.advertising.com", +]; + +/// English consent-wall title prefixes. Many providers localize this page, so +/// this is a best-effort secondary signal. URL shape is the primary signal. +const CONSENT_TITLES: &[&str] = &[ + "before you continue", + "your privacy choices", + "we value your privacy", + "we care about your privacy", + "cookie consent", + "consent required", +]; + +/// Detect why a page returned empty or near-empty content. +#[derive(Debug, PartialEq, Eq)] +pub enum EmptyReason { + /// Anti-bot challenge page (Cloudflare, Akamai, etc.) + Antibot, + /// GDPR/cookie consent redirect. + ConsentWall, + /// JS-only SPA that returns an empty shell without a browser + JsRequired, + /// Page has content. + None, +} + +pub fn detect_empty(result: &ExtractionResult) -> EmptyReason { + // Consent walls can have a tiny body, so check before the content + // short-circuit. + if is_consent_wall(result) { + return EmptyReason::ConsentWall; + } + + // Has real content. Nothing to warn about. 
+ if result.metadata.word_count > 50 || !result.content.markdown.is_empty() { + return EmptyReason::None; + } + + // Check for known anti-bot challenge titles + if let Some(ref title) = result.metadata.title { + let lower = title.to_lowercase(); + if ANTIBOT_TITLES.iter().any(|t| lower.starts_with(t)) { + return EmptyReason::Antibot; + } + } + + // Empty content with no title or a generic SPA shell = JS-only site + if result.metadata.word_count == 0 && result.content.links.is_empty() { + return EmptyReason::JsRequired; + } + + EmptyReason::None +} + +/// A consent wall is identified by either: +/// 1. The final URL pointing at a known consent host/path, or +/// 2. A consent-wall title prefix with a very small body. +fn is_consent_wall(result: &ExtractionResult) -> bool { + if let Some(ref url) = result.metadata.url { + let lower = url.to_ascii_lowercase(); + if CONSENT_URL_FRAGMENTS + .iter() + .any(|fragment| lower.contains(fragment)) + { + return true; + } + } + + if result.metadata.word_count <= 50 + && let Some(ref title) = result.metadata.title + { + let lower = title.to_lowercase(); + if CONSENT_TITLES + .iter() + .any(|prefix| lower.starts_with(prefix)) + { + return true; + } + } + + false +} + +pub fn warn_empty(url: &str, reason: &EmptyReason) { + match reason { + EmptyReason::Antibot => eprintln!( + "\x1b[33mwarning:\x1b[0m Anti-bot protection detected on {url}\n\ + This site requires CAPTCHA solving or browser rendering.\n\ + Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing" + ), + EmptyReason::ConsentWall => eprintln!( + "\x1b[33mwarning:\x1b[0m GDPR/cookie consent wall detected on {url}\n\ + The site redirected to a consent page and returned no usable content.\n\ + Try a different region via --proxy, or pass a pre-accepted consent cookie\n\ + via --cookie / --cookie-file." 
+ ), + EmptyReason::JsRequired => eprintln!( + "\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\ + This site requires JavaScript rendering (SPA).\n\ + Use the webclaw Cloud API for JS rendering: https://webclaw.io/pricing" + ), + EmptyReason::None => {} + } +} + +/// Build FetchConfig from CLI flags. +/// +/// `--proxy` sets a single static proxy (no rotation). +/// `--proxy-file` loads a pool of proxies and rotates per-request. +/// `--proxy` takes priority: if both are set, only the single proxy is used. +pub fn build_fetch_config(cli: &Cli) -> FetchConfig { + let (proxy, proxy_pool) = if cli.proxy.is_some() { + (cli.proxy.clone(), Vec::new()) + } else if let Some(ref path) = cli.proxy_file { + match webclaw_fetch::parse_proxy_file(path) { + Ok(pool) => (None, pool), + Err(e) => { + eprintln!("warning: {e}"); + (None, Vec::new()) + } + } + } else if std::path::Path::new("proxies.txt").exists() { + // Auto-load proxies.txt from working directory if present + match webclaw_fetch::parse_proxy_file("proxies.txt") { + Ok(pool) if !pool.is_empty() => { + eprintln!("loaded {} proxies from proxies.txt", pool.len()); + (None, pool) + } + _ => (None, Vec::new()), + } + } else { + (None, Vec::new()) + }; + + let mut headers = std::collections::HashMap::from([( + "Accept-Language".to_string(), + "en-US,en;q=0.9".to_string(), + )]); + + // Parse -H "Key: Value" flags + for h in &cli.headers { + if let Some((key, val)) = h.split_once(':') { + headers.insert(key.trim().to_string(), val.trim().to_string()); + } + } + + // --cookie shorthand + if let Some(ref cookie) = cli.cookie { + headers.insert("Cookie".to_string(), cookie.clone()); + } + + // --cookie-file: parse JSON array of {name, value, domain, ...} + if let Some(ref path) = cli.cookie_file { + match parse_cookie_file(path) { + Ok(cookie_str) => { + // Merge with existing cookies if --cookie was also provided + if let Some(existing) = headers.get("Cookie") { + headers.insert("Cookie".to_string(), 
format!("{existing}; {cookie_str}")); + } else { + headers.insert("Cookie".to_string(), cookie_str); + } + } + Err(e) => { + eprintln!("error: failed to parse cookie file: {e}"); + process::exit(1); + } + } + } + + FetchConfig { + browser: cli.browser.clone().into(), + proxy, + proxy_pool, + timeout: std::time::Duration::from_secs(cli.timeout), + pdf_mode: cli.pdf_mode.clone().into(), + headers, + ..Default::default() + } +} + +/// Parse a JSON cookie file (Chrome extension format) into a Cookie header string. +/// Supports: [{name, value, domain, path, secure, httpOnly, expirationDate, ...}] +fn parse_cookie_file(path: &str) -> Result { + let content = std::fs::read_to_string(path).map_err(|e| format!("cannot read {path}: {e}"))?; + let cookies: Vec = + serde_json::from_str(&content).map_err(|e| format!("invalid JSON: {e}"))?; + + let pairs: Vec = cookies + .iter() + .filter_map(|c| { + let name = c.get("name")?.as_str()?; + let value = c.get("value")?.as_str()?; + Some(format!("{name}={value}")) + }) + .collect(); + + if pairs.is_empty() { + return Err("no cookies found in file".to_string()); + } + + Ok(pairs.join("; ")) +} + +pub fn build_extraction_options(cli: &Cli) -> ExtractionOptions { + ExtractionOptions { + include_selectors: cli + .include + .as_deref() + .map(|s| s.split(',').map(|s| s.trim().to_string()).collect()) + .unwrap_or_default(), + exclude_selectors: cli + .exclude + .as_deref() + .map(|s| s.split(',').map(|s| s.trim().to_string()).collect()) + .unwrap_or_default(), + only_main_content: cli.only_main_content, + include_raw_html: cli.raw_html || matches!(cli.format, crate::cli::OutputFormat::Html), + } +} + +/// Normalize a URL: prepend `https://` if no scheme is present. +pub fn normalize_url(url: &str) -> String { + let trimmed = url.trim(); + if trimmed.contains("://") { + trimmed.to_string() + } else { + format!("https://{trimmed}") + } +} + +/// Derive a filename from a URL for `--output-dir`. 
+/// +/// Strips the scheme/host, maps the path to a filesystem path, and appends +/// an extension matching the output format. +pub fn url_to_filename(raw_url: &str, format: &crate::cli::OutputFormat) -> String { + use crate::cli::OutputFormat; + let ext = match format { + OutputFormat::Markdown | OutputFormat::Llm => "md", + OutputFormat::Json => "json", + OutputFormat::Text => "txt", + OutputFormat::Html => "html", + }; + + let parsed = url::Url::parse(raw_url); + let (host, path, query) = match &parsed { + Ok(u) => ( + u.host_str().unwrap_or("unknown").to_string(), + u.path().to_string(), + u.query().map(String::from), + ), + Err(_) => (String::new(), String::new(), None), + }; + + // Drop empty / "." / ".." path segments so a URL path like + // `/../../etc/passwd` can't climb out of the output directory. + let cleaned_path: String = path + .split('/') + .filter(|seg| !seg.is_empty() && *seg != "." && *seg != "..") + .collect::>() + .join("/"); + + let mut stem = cleaned_path; + if stem.is_empty() { + // Use hostname for root URLs to avoid collisions in batch mode + let clean_host = host.strip_prefix("www.").unwrap_or(&host); + stem = format!("{}/index", clean_host.replace('.', "_")); + } + + // Append query params so /p?id=123 doesn't collide with /p?id=456 + if let Some(q) = query { + stem = format!("{stem}_{q}"); + } + + // Sanitize: keep alphanumeric, dash, underscore, dot, slash + let sanitized: String = stem + .chars() + .map(|c| { + if c.is_alphanumeric() || matches!(c, '-' | '_' | '.' | '/') { + c + } else { + '_' + } + }) + .collect(); + + format!("{sanitized}.{ext}") +} + +/// Reject a caller-supplied (CSV `url,filename`) name that could escape the +/// output directory: absolute paths, drive prefixes, root, or any `..` +/// component. Returns the validated relative path on success. 
+fn safe_relative_filename(filename: &str) -> Result { + let candidate = Path::new(filename); + use std::path::Component; + for comp in candidate.components() { + match comp { + Component::Normal(_) | Component::CurDir => {} + Component::ParentDir => { + return Err(format!("refusing path with '..' component: {filename}")); + } + Component::RootDir | Component::Prefix(_) => { + return Err(format!("refusing absolute output path: {filename}")); + } + } + } + if candidate.as_os_str().is_empty() { + return Err("empty output filename".to_string()); + } + Ok(candidate.to_path_buf()) +} + +/// Write extraction output to a file inside `dir`, creating parent dirs as needed. +/// +/// `filename` may originate from an attacker-controlled `--urls-file` +/// (`url,filename` CSV). It is validated for traversal, and the canonical +/// destination directory is asserted to stay under the canonical output +/// directory before any write. +pub fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> { + let rel = safe_relative_filename(filename)?; + let dest = dir.join(&rel); + + std::fs::create_dir_all(dir) + .map_err(|e| format!("failed to create directory {}: {e}", dir.display()))?; + let base = dir + .canonicalize() + .map_err(|e| format!("failed to resolve output dir {}: {e}", dir.display()))?; + + if let Some(parent) = dest.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?; + let canon_parent = parent + .canonicalize() + .map_err(|e| format!("failed to resolve {}: {e}", parent.display()))?; + if !canon_parent.starts_with(&base) { + return Err(format!( + "refusing to write outside output dir: {}", + dest.display() + )); + } + } + + std::fs::write(&dest, content) + .map_err(|e| format!("failed to write {}: {e}", dest.display()))?; + let word_count = content.split_whitespace().count(); + eprintln!("Saved: {} ({word_count} words)", dest.display()); + Ok(()) +} + +/// Collect all URLs 
from positional args + --urls-file, normalizing bare domains. +/// +/// Returns `(url, optional_custom_filename)` pairs. Custom filenames come from +/// CSV-style lines in `--urls-file`: `url,filename`. Plain lines (no comma) get +/// `None` so the caller auto-generates the filename from the URL. +pub fn collect_urls(cli: &Cli) -> Result)>, String> { + let mut entries: Vec<(String, Option)> = + cli.urls.iter().map(|u| (normalize_url(u), None)).collect(); + + if let Some(ref path) = cli.urls_file { + let content = + std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?; + for line in content.lines() { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with('#') { + continue; + } + if let Some((url_part, name_part)) = trimmed.split_once(',') { + let name = name_part.trim(); + let custom = if name.is_empty() { + None + } else { + Some(name.to_string()) + }; + entries.push((normalize_url(url_part.trim()), custom)); + } else { + entries.push((normalize_url(trimmed), None)); + } + } + } + + Ok(entries) +} + +/// Result that can be either a local extraction or a cloud API JSON response. +pub enum FetchOutput { + Local(Box), + Cloud(serde_json::Value), +} + +impl FetchOutput { + /// Get the local ExtractionResult, or try to parse it from the cloud response. + pub fn into_extraction(self) -> Result { + match self { + FetchOutput::Local(r) => Ok(*r), + FetchOutput::Cloud(resp) => { + // Cloud response has an "extraction" field with the full ExtractionResult + resp.get("extraction") + .and_then(|v| serde_json::from_value(v.clone()).ok()) + .or_else(|| serde_json::from_value(resp.clone()).ok()) + .ok_or_else(|| "could not parse extraction from cloud response".to_string()) + } + } + } +} + +/// Fetch a URL and extract content, handling PDF detection automatically. +/// Falls back to cloud API when bot protection or JS rendering is detected. 
+pub async fn fetch_and_extract(cli: &Cli) -> Result { + // Local sources: read and extract as HTML + if cli.stdin { + let mut buf = String::new(); + io::stdin() + .read_to_string(&mut buf) + .map_err(|e| format!("failed to read stdin: {e}"))?; + let options = build_extraction_options(cli); + return extract_with_options(&buf, None, &options) + .map(|r| FetchOutput::Local(Box::new(r))) + .map_err(|e| format!("extraction error: {e}")); + } + + if let Some(ref path) = cli.file { + let html = + std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?; + let options = build_extraction_options(cli); + return extract_with_options(&html, None, &options) + .map(|r| FetchOutput::Local(Box::new(r))) + .map_err(|e| format!("extraction error: {e}")); + } + + let raw_url = cli + .urls + .first() + .ok_or("no input provided -- pass a URL, --file, or --stdin")?; + let url = normalize_url(raw_url); + let url = url.as_str(); + + let cloud_client = webclaw_fetch::cloud::CloudClient::new(cli.api_key.as_deref()); + + // --cloud: skip local, go straight to cloud API + if cli.cloud { + let c = + cloud_client.ok_or("--cloud requires WEBCLAW_API_KEY (set via env or --api-key)")?; + let options = build_extraction_options(cli); + let resp = c + .scrape( + url, + &[cli.format.as_api_str()], + &options.include_selectors, + &options.exclude_selectors, + options.only_main_content, + ) + .await?; + return Ok(FetchOutput::Cloud(resp)); + } + + // Normal path: try local first + let client = + FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; + let options = build_extraction_options(cli); + let result = client + .fetch_and_extract_with_options(url, &options) + .await + .map_err(|e| format!("fetch error: {e}"))?; + + // Check if we should fall back to cloud + let reason = detect_empty(&result); + if !matches!(reason, EmptyReason::None) { + if let Some(ref c) = cloud_client { + eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API..."); 
+ match c + .scrape( + url, + &[cli.format.as_api_str()], + &options.include_selectors, + &options.exclude_selectors, + options.only_main_content, + ) + .await + { + Ok(resp) => return Ok(FetchOutput::Cloud(resp)), + Err(e) => { + eprintln!("\x1b[33mwarning:\x1b[0m cloud fallback failed: {e}"); + // Fall through to return the local result with a warning + } + } + } + warn_empty(url, &reason); + } + + Ok(FetchOutput::Local(Box::new(result))) +} + +/// Fetch raw HTML from a URL (no extraction). Used for --raw-html and brand extraction. +pub async fn fetch_html(cli: &Cli) -> Result { + if cli.stdin { + let mut buf = String::new(); + io::stdin() + .read_to_string(&mut buf) + .map_err(|e| format!("failed to read stdin: {e}"))?; + return Ok(FetchResult { + html: buf, + url: String::new(), + status: 200, + headers: Default::default(), + elapsed: Default::default(), + }); + } + + if let Some(ref path) = cli.file { + let html = + std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?; + return Ok(FetchResult { + html, + url: String::new(), + status: 200, + headers: Default::default(), + elapsed: Default::default(), + }); + } + + let raw_url = cli + .urls + .first() + .ok_or("no input provided -- pass a URL, --file, or --stdin")?; + let url = normalize_url(raw_url); + + let client = + FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; + client + .fetch(&url) + .await + .map_err(|e| format!("fetch error: {e}")) +} + +/// Fetch external stylesheets referenced in HTML and inject them as `\n"); + } + } + + if extra_css.is_empty() { + return html.to_string(); + } + + if let Some(pos) = html.to_lowercase().find("") { + let mut enriched = String::with_capacity(html.len() + extra_css.len()); + enriched.push_str(&html[..pos]); + enriched.push_str(&extra_css); + enriched.push_str(&html[pos..]); + enriched + } else { + format!("{extra_css}{html}") + } +} + +#[cfg(test)] +mod tests { + use super::*; + use 
crate::cli::OutputFormat; + use webclaw_core::{Content, Metadata}; + + fn empty_result(title: Option<&str>, url: Option<&str>, markdown: &str) -> ExtractionResult { + let metadata = Metadata::default() + .with_title(title.map(str::to_string)) + .with_url(url.map(str::to_string)) + .with_word_count(markdown.split_whitespace().count()); + let content = Content::default() + .with_markdown(markdown.to_string()) + .with_plain_text(markdown.to_string()); + ExtractionResult::new(metadata, content) + } + + #[test] + fn detect_empty_identifies_consent_redirect_url() { + let result = empty_result( + Some("Yahoo"), + Some("https://guce.advertising.com/collectIdentifiers?sessionId=abc"), + "Continue", + ); + assert_eq!(detect_empty(&result), EmptyReason::ConsentWall); + } + + #[test] + fn detect_empty_identifies_short_consent_title() { + let result = empty_result( + Some("Before you continue"), + Some("https://www.google.com/"), + "Review privacy options", + ); + assert_eq!(detect_empty(&result), EmptyReason::ConsentWall); + } + + #[test] + fn detect_empty_does_not_flag_real_content_with_consent_words() { + let result = empty_result( + Some("Cookie consent patterns explained"), + Some("https://example.com/blog"), + "This article explains cookie consent patterns for product teams with enough real body text to be useful. It covers consent banners, privacy controls, analytics configuration, regional requirements, product tradeoffs, implementation details, testing flows, debugging notes, accessibility needs, and operational lessons from real teams shipping public websites across multiple markets. 
It also explains measurement, rollout planning, copy review, support workflows, design constraints, release notes, and how to keep privacy choices understandable for users.", + ); + assert_eq!(detect_empty(&result), EmptyReason::None); + } + + #[test] + fn url_to_filename_root() { + assert_eq!( + url_to_filename("https://example.com/", &OutputFormat::Markdown), + "example_com/index.md" + ); + assert_eq!( + url_to_filename("https://example.com", &OutputFormat::Markdown), + "example_com/index.md" + ); + } + + #[test] + fn url_to_filename_path() { + assert_eq!( + url_to_filename("https://example.com/docs/api", &OutputFormat::Markdown), + "docs/api.md" + ); + } + + #[test] + fn url_to_filename_trailing_slash() { + assert_eq!( + url_to_filename("https://example.com/docs/api/", &OutputFormat::Markdown), + "docs/api.md" + ); + } + + #[test] + fn url_to_filename_nested_path() { + assert_eq!( + url_to_filename("https://example.com/blog/my-post", &OutputFormat::Markdown), + "blog/my-post.md" + ); + } + + #[test] + fn url_to_filename_query_params() { + assert_eq!( + url_to_filename("https://example.com/p?id=123", &OutputFormat::Markdown), + "p_id_123.md" + ); + } + + #[test] + fn url_to_filename_json_format() { + assert_eq!( + url_to_filename("https://example.com/docs/api", &OutputFormat::Json), + "docs/api.json" + ); + } + + #[test] + fn url_to_filename_text_format() { + assert_eq!( + url_to_filename("https://example.com/docs/api", &OutputFormat::Text), + "docs/api.txt" + ); + } + + #[test] + fn url_to_filename_llm_format() { + assert_eq!( + url_to_filename("https://example.com/docs/api", &OutputFormat::Llm), + "docs/api.md" + ); + } + + #[test] + fn url_to_filename_html_format() { + assert_eq!( + url_to_filename("https://example.com/docs/api", &OutputFormat::Html), + "docs/api.html" + ); + } + + #[test] + fn url_to_filename_special_chars() { + // Spaces and special chars get replaced with underscores + assert_eq!( + url_to_filename( + 
"https://example.com/path%20with%20spaces", + &OutputFormat::Markdown + ), + "path_20with_20spaces.md" + ); + } + + #[test] + fn write_to_file_creates_dirs() { + let dir = std::env::temp_dir().join("webclaw_test_output_dir"); + let _ = std::fs::remove_dir_all(&dir); + write_to_file(&dir, "nested/deep/file.md", "hello").unwrap(); + let content = std::fs::read_to_string(dir.join("nested/deep/file.md")).unwrap(); + assert_eq!(content, "hello"); + let _ = std::fs::remove_dir_all(&dir); + } + + #[test] + fn url_to_filename_strips_traversal_segments() { + // `..` / `.` / empty path segments must not survive into the path. + let out = url_to_filename( + "https://example.com/../../etc/passwd", + &OutputFormat::Markdown, + ); + assert!(!out.contains(".."), "traversal leaked: {out}"); + assert_eq!(out, "etc/passwd.md"); + let out2 = url_to_filename("https://example.com/a/./b//c", &OutputFormat::Json); + assert_eq!(out2, "a/b/c.json"); + } + + #[test] + fn safe_relative_filename_rejects_escapes() { + assert!(safe_relative_filename("../escape.md").is_err()); + assert!(safe_relative_filename("a/../../b.md").is_err()); + assert!(safe_relative_filename("/etc/passwd").is_err()); + assert!(safe_relative_filename("").is_err()); + // Normal nested relative names stay allowed. + assert!(safe_relative_filename("nested/deep/file.md").is_ok()); + assert!(safe_relative_filename("./ok.md").is_ok()); + } + + #[test] + fn write_to_file_refuses_traversal_filename() { + let dir = std::env::temp_dir().join("webclaw_test_traversal_dir"); + let _ = std::fs::remove_dir_all(&dir); + // CSV-supplied `url,filename` traversal attempt. 
+ let err = write_to_file(&dir, "../../tmp/webclaw_pwned.md", "x").unwrap_err(); + assert!(err.contains("refusing"), "unexpected error: {err}"); + assert!( + !std::path::Path::new("/tmp/webclaw_pwned.md").exists(), + "traversal write escaped the output dir" + ); + let _ = std::fs::remove_dir_all(&dir); + } +} diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 1348824..39088be 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -1,456 +1,28 @@ /// CLI entry point -- wires webclaw-core and webclaw-fetch into a single command. -/// All extraction and fetching logic lives in sibling crates; this is pure plumbing. +/// All extraction and fetching logic lives in sibling crates and modules; this +/// file is the argument parser plus dispatch. mod bench; +mod cli; +mod fetch; +mod output; +mod run; +mod webhook; -use std::io::{self, Read as _}; -use std::path::{Path, PathBuf}; use std::process; -use std::sync::Arc; -use std::sync::atomic::{AtomicBool, Ordering}; -use clap::{Parser, Subcommand, ValueEnum}; +use clap::Parser; use tracing_subscriber::EnvFilter; -use webclaw_core::{ - ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options, - to_llm_text, + +use cli::{Cli, Commands}; +use fetch::{ + FetchOutput, collect_urls, fetch_and_extract, fetch_html, normalize_url, url_to_filename, + write_to_file, }; -use webclaw_fetch::{ - BatchExtractResult, BrowserProfile, CrawlConfig, CrawlResult, Crawler, FetchClient, - FetchConfig, FetchResult, PageResult, SitemapEntry, +use output::{format_output, print_cloud_output, print_output}; +use run::{ + has_llm_flags, run_batch, run_batch_llm, run_brand, run_crawl, run_diff, run_llm, run_map, + run_research, run_watch, }; -use webclaw_llm::LlmProvider; -use webclaw_pdf::PdfMode; - -/// Known anti-bot challenge page titles (case-insensitive prefix match). 
-const ANTIBOT_TITLES: &[&str] = &[ - "just a moment", - "attention required", - "access denied", - "checking your browser", - "please wait", - "one more step", - "verify you are human", - "bot verification", - "security check", - "ddos protection", -]; - -/// URL host/path fragments that indicate a GDPR/cookie consent redirect. -const CONSENT_URL_FRAGMENTS: &[&str] = &[ - "://consent.", - "/consent?", - "/consent/", - "collectconsent", - "consentcheck", - "/cmp/", - "guce.advertising.com", -]; - -/// English consent-wall title prefixes. Many providers localize this page, so -/// this is a best-effort secondary signal. URL shape is the primary signal. -const CONSENT_TITLES: &[&str] = &[ - "before you continue", - "your privacy choices", - "we value your privacy", - "we care about your privacy", - "cookie consent", - "consent required", -]; - -/// Detect why a page returned empty or near-empty content. -#[derive(Debug, PartialEq, Eq)] -enum EmptyReason { - /// Anti-bot challenge page (Cloudflare, Akamai, etc.) - Antibot, - /// GDPR/cookie consent redirect. - ConsentWall, - /// JS-only SPA that returns an empty shell without a browser - JsRequired, - /// Page has content. - None, -} - -fn detect_empty(result: &ExtractionResult) -> EmptyReason { - // Consent walls can have a tiny body, so check before the content - // short-circuit. - if is_consent_wall(result) { - return EmptyReason::ConsentWall; - } - - // Has real content. Nothing to warn about. 
- if result.metadata.word_count > 50 || !result.content.markdown.is_empty() { - return EmptyReason::None; - } - - // Check for known anti-bot challenge titles - if let Some(ref title) = result.metadata.title { - let lower = title.to_lowercase(); - if ANTIBOT_TITLES.iter().any(|t| lower.starts_with(t)) { - return EmptyReason::Antibot; - } - } - - // Empty content with no title or a generic SPA shell = JS-only site - if result.metadata.word_count == 0 && result.content.links.is_empty() { - return EmptyReason::JsRequired; - } - - EmptyReason::None -} - -/// A consent wall is identified by either: -/// 1. The final URL pointing at a known consent host/path, or -/// 2. A consent-wall title prefix with a very small body. -fn is_consent_wall(result: &ExtractionResult) -> bool { - if let Some(ref url) = result.metadata.url { - let lower = url.to_ascii_lowercase(); - if CONSENT_URL_FRAGMENTS - .iter() - .any(|fragment| lower.contains(fragment)) - { - return true; - } - } - - if result.metadata.word_count <= 50 - && let Some(ref title) = result.metadata.title - { - let lower = title.to_lowercase(); - if CONSENT_TITLES - .iter() - .any(|prefix| lower.starts_with(prefix)) - { - return true; - } - } - - false -} - -fn warn_empty(url: &str, reason: &EmptyReason) { - match reason { - EmptyReason::Antibot => eprintln!( - "\x1b[33mwarning:\x1b[0m Anti-bot protection detected on {url}\n\ - This site requires CAPTCHA solving or browser rendering.\n\ - Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing" - ), - EmptyReason::ConsentWall => eprintln!( - "\x1b[33mwarning:\x1b[0m GDPR/cookie consent wall detected on {url}\n\ - The site redirected to a consent page and returned no usable content.\n\ - Try a different region via --proxy, or pass a pre-accepted consent cookie\n\ - via --cookie / --cookie-file." 
- ), - EmptyReason::JsRequired => eprintln!( - "\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\ - This site requires JavaScript rendering (SPA).\n\ - Use the webclaw Cloud API for JS rendering: https://webclaw.io/pricing" - ), - EmptyReason::None => {} - } -} - -#[derive(Parser)] -#[command(name = "webclaw", about = "Extract web content for LLMs", version)] -struct Cli { - /// Optional subcommand. When omitted, the CLI falls back to the - /// traditional flag-based flow (URL + --format, --crawl, etc.). - /// Subcommands are used for flows that don't fit that model. - #[command(subcommand)] - command: Option, - - /// URLs to fetch (multiple allowed) - #[arg()] - urls: Vec, - - /// File with URLs (one per line) - #[arg(long)] - urls_file: Option, - - /// Output format (markdown, json, text, llm, html) - #[arg(short, long, default_value = "markdown")] - format: OutputFormat, - - /// Browser to impersonate - #[arg(short, long, default_value = "chrome")] - browser: Browser, - - /// Proxy URL (http://user:pass@host:port or socks5://host:port) - #[arg(short, long, env = "WEBCLAW_PROXY")] - proxy: Option, - - /// File with proxies (host:port:user:pass, one per line). Rotates per request. - #[arg(long, env = "WEBCLAW_PROXY_FILE")] - proxy_file: Option, - - /// Request timeout in seconds - #[arg(short, long, default_value = "30")] - timeout: u64, - - /// Extract from local HTML file instead of fetching - #[arg(long)] - file: Option, - - /// Read HTML from stdin - #[arg(long)] - stdin: bool, - - /// Include metadata in output (always included in JSON) - #[arg(long)] - metadata: bool, - - /// Output raw fetched HTML instead of extracting - #[arg(long)] - raw_html: bool, - - /// CSS selectors to include (comma-separated, e.g. "article,.content") - #[arg(long)] - include: Option, - - /// CSS selectors to exclude (comma-separated, e.g. 
"nav,.sidebar,footer") - #[arg(long)] - exclude: Option, - - /// Only extract main content (article/main element) - #[arg(long)] - only_main_content: bool, - - /// Custom headers (repeatable, e.g. -H "Cookie: foo=bar") - #[arg(short = 'H', long = "header")] - headers: Vec, - - /// Cookie string (shorthand for -H "Cookie: ...") - #[arg(long)] - cookie: Option, - - /// JSON cookie file (Chrome extension format: [{name, value, domain, ...}]) - #[arg(long)] - cookie_file: Option, - - /// Enable verbose logging - #[arg(short, long)] - verbose: bool, - - /// Compare against a previous JSON snapshot - #[arg(long)] - diff_with: Option, - - /// Watch a URL for changes. Checks at the specified interval and reports diffs. - #[arg(long)] - watch: bool, - - /// Watch interval in seconds [default: 300] - #[arg(long, default_value = "300")] - watch_interval: u64, - - /// Command to run when changes are detected (receives diff JSON on stdin) - #[arg(long)] - on_change: Option, - - /// Webhook URL: POST a JSON payload when an operation completes. - /// Works with crawl, batch, watch (on change), and single URL modes. 
- #[arg(long, env = "WEBCLAW_WEBHOOK_URL")] - webhook: Option, - - /// Extract brand identity (colors, fonts, logo) - #[arg(long)] - brand: bool, - - // -- PDF options -- - /// PDF extraction mode: auto (error on empty) or fast (return whatever text is found) - #[arg(long, default_value = "auto")] - pdf_mode: PdfModeArg, - - // -- Crawl options -- - /// Enable recursive crawling of same-domain links - #[arg(long)] - crawl: bool, - - /// Max crawl depth [default: 1] - #[arg(long, default_value = "1")] - depth: usize, - - /// Max pages to crawl [default: 20] - #[arg(long, default_value = "20")] - max_pages: usize, - - /// Max concurrent requests [default: 5] - #[arg(long, default_value = "5")] - concurrency: usize, - - /// Delay between requests in ms [default: 100] - #[arg(long, default_value = "100")] - delay: u64, - - /// Only crawl URLs matching this path prefix - #[arg(long)] - path_prefix: Option, - - /// Glob patterns for crawl URL paths to include (comma-separated, e.g. "/api/*,/guides/**") - #[arg(long)] - include_paths: Option, - - /// Glob patterns for crawl URL paths to exclude (comma-separated, e.g. "/changelog/*,/blog/*") - #[arg(long)] - exclude_paths: Option, - - /// Path to save/resume crawl state. On Ctrl+C: saves progress. On start: resumes if file exists. 
- #[arg(long)] - crawl_state: Option, - - /// Seed crawl frontier from sitemap discovery (robots.txt + /sitemap.xml) - #[arg(long)] - sitemap: bool, - - /// Discover URLs from sitemap and print them (one per line; JSON array with --format json) - #[arg(long)] - map: bool, - - // -- LLM options -- - /// Extract structured JSON using LLM (pass a JSON schema string or @file) - #[arg(long)] - extract_json: Option, - - /// Extract using natural language prompt - #[arg(long)] - extract_prompt: Option, - - /// Summarize content using LLM (optional: number of sentences, default 3) - #[arg(long, num_args = 0..=1, default_missing_value = "3")] - summarize: Option, - - /// Force a specific LLM provider (ollama, openai, anthropic) - #[arg(long, env = "WEBCLAW_LLM_PROVIDER")] - llm_provider: Option, - - /// Override the LLM model name - #[arg(long, env = "WEBCLAW_LLM_MODEL")] - llm_model: Option, - - /// Override the LLM base URL (Ollama, OpenAI-compatible, or Anthropic-compatible) - #[arg(long, env = "WEBCLAW_LLM_BASE_URL")] - llm_base_url: Option, - - // -- Cloud API options -- - /// Webclaw Cloud API key for automatic fallback on bot-protected or JS-rendered sites - #[arg(long, env = "WEBCLAW_API_KEY")] - api_key: Option, - - /// Force all requests through the cloud API (skip local extraction) - #[arg(long)] - cloud: bool, - - /// Run deep research on a topic via the cloud API. Requires --api-key. - /// Saves full result (report + sources + findings) to a JSON file. - #[arg(long)] - research: Option, - - /// Enable deep research mode (longer, more thorough report). Used with --research. - #[arg(long)] - deep: bool, - - /// Output directory: save each page to a separate file instead of stdout. - /// Works with --crawl, batch (multiple URLs), and single URL mode. - /// Filenames are derived from URL paths (e.g. /docs/api -> docs/api.md). 
- #[arg(long)] - output_dir: Option, -} - -#[derive(Subcommand)] -enum Commands { - /// Per-URL extraction micro-benchmark: compares raw HTML vs. the - /// webclaw --format llm output on token count, bytes, and - /// extraction time. Uses an approximate tokenizer (see `--help`). - Bench { - /// URL to benchmark. - url: String, - - /// Emit a single JSON line instead of the ASCII table. - /// Machine-readable shape stable across releases. - #[arg(long)] - json: bool, - - /// Optional path to a facts.json (same schema as the repo's - /// benchmarks/facts.json) for a fidelity column. - #[arg(long)] - facts: Option, - }, - - /// List all vertical extractors in the catalog. - /// - /// Each entry has a stable `name` (usable with `webclaw vertical `), - /// a human-friendly label, a one-line description, and the URL - /// patterns it claims. The same data is served by `/v1/extractors` - /// when running the REST API. - Extractors { - /// Emit JSON instead of a human-friendly table. - #[arg(long)] - json: bool, - }, - - /// Run a vertical extractor by name. Returns typed JSON with fields - /// specific to the target site (title, price, author, rating, etc.) - /// rather than generic markdown. - /// - /// Use `webclaw extractors` to see the full list. Example: - /// `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/`. - Vertical { - /// Vertical name (e.g. `reddit`, `github_repo`, `trustpilot_reviews`). - name: String, - /// URL to extract. - url: String, - /// Emit compact JSON (single line). Default is pretty-printed. - #[arg(long)] - raw: bool, - }, -} - -#[derive(Clone, ValueEnum)] -enum OutputFormat { - Markdown, - Json, - Text, - Llm, - Html, -} - -#[derive(Clone, ValueEnum)] -enum Browser { - Chrome, - Firefox, - /// Safari iOS 26. Pair with a country-matched residential proxy for sites - /// that reject non-mobile profiles. 
- SafariIos, - Random, -} - -#[derive(Clone, ValueEnum, Default)] -enum PdfModeArg { - /// Error if PDF has no extractable text (catches scanned PDFs) - #[default] - Auto, - /// Return whatever text is found, even if empty - Fast, -} - -impl From for PdfMode { - fn from(arg: PdfModeArg) -> Self { - match arg { - PdfModeArg::Auto => PdfMode::Auto, - PdfModeArg::Fast => PdfMode::Fast, - } - } -} - -impl From for BrowserProfile { - fn from(b: Browser) -> Self { - match b { - Browser::Chrome => BrowserProfile::Chrome, - Browser::Firefox => BrowserProfile::Firefox, - Browser::SafariIos => BrowserProfile::SafariIos, - Browser::Random => BrowserProfile::Random, - } - } -} fn init_logging(verbose: bool) { // html5ever / markup5ever / selectors emit WARN on common real-world HTML @@ -466,1981 +38,6 @@ fn init_logging(verbose: bool) { tracing_subscriber::fmt().with_env_filter(filter).init(); } -/// Build FetchConfig from CLI flags. -/// -/// `--proxy` sets a single static proxy (no rotation). -/// `--proxy-file` loads a pool of proxies and rotates per-request. -/// `--proxy` takes priority: if both are set, only the single proxy is used. 
-fn build_fetch_config(cli: &Cli) -> FetchConfig { - let (proxy, proxy_pool) = if cli.proxy.is_some() { - (cli.proxy.clone(), Vec::new()) - } else if let Some(ref path) = cli.proxy_file { - match webclaw_fetch::parse_proxy_file(path) { - Ok(pool) => (None, pool), - Err(e) => { - eprintln!("warning: {e}"); - (None, Vec::new()) - } - } - } else if std::path::Path::new("proxies.txt").exists() { - // Auto-load proxies.txt from working directory if present - match webclaw_fetch::parse_proxy_file("proxies.txt") { - Ok(pool) if !pool.is_empty() => { - eprintln!("loaded {} proxies from proxies.txt", pool.len()); - (None, pool) - } - _ => (None, Vec::new()), - } - } else { - (None, Vec::new()) - }; - - let mut headers = std::collections::HashMap::from([( - "Accept-Language".to_string(), - "en-US,en;q=0.9".to_string(), - )]); - - // Parse -H "Key: Value" flags - for h in &cli.headers { - if let Some((key, val)) = h.split_once(':') { - headers.insert(key.trim().to_string(), val.trim().to_string()); - } - } - - // --cookie shorthand - if let Some(ref cookie) = cli.cookie { - headers.insert("Cookie".to_string(), cookie.clone()); - } - - // --cookie-file: parse JSON array of {name, value, domain, ...} - if let Some(ref path) = cli.cookie_file { - match parse_cookie_file(path) { - Ok(cookie_str) => { - // Merge with existing cookies if --cookie was also provided - if let Some(existing) = headers.get("Cookie") { - headers.insert("Cookie".to_string(), format!("{existing}; {cookie_str}")); - } else { - headers.insert("Cookie".to_string(), cookie_str); - } - } - Err(e) => { - eprintln!("error: failed to parse cookie file: {e}"); - process::exit(1); - } - } - } - - FetchConfig { - browser: cli.browser.clone().into(), - proxy, - proxy_pool, - timeout: std::time::Duration::from_secs(cli.timeout), - pdf_mode: cli.pdf_mode.clone().into(), - headers, - ..Default::default() - } -} - -/// Parse a JSON cookie file (Chrome extension format) into a Cookie header string. 
-/// Supports: [{name, value, domain, path, secure, httpOnly, expirationDate, ...}] -fn parse_cookie_file(path: &str) -> Result { - let content = std::fs::read_to_string(path).map_err(|e| format!("cannot read {path}: {e}"))?; - let cookies: Vec = - serde_json::from_str(&content).map_err(|e| format!("invalid JSON: {e}"))?; - - let pairs: Vec = cookies - .iter() - .filter_map(|c| { - let name = c.get("name")?.as_str()?; - let value = c.get("value")?.as_str()?; - Some(format!("{name}={value}")) - }) - .collect(); - - if pairs.is_empty() { - return Err("no cookies found in file".to_string()); - } - - Ok(pairs.join("; ")) -} - -fn build_extraction_options(cli: &Cli) -> ExtractionOptions { - ExtractionOptions { - include_selectors: cli - .include - .as_deref() - .map(|s| s.split(',').map(|s| s.trim().to_string()).collect()) - .unwrap_or_default(), - exclude_selectors: cli - .exclude - .as_deref() - .map(|s| s.split(',').map(|s| s.trim().to_string()).collect()) - .unwrap_or_default(), - only_main_content: cli.only_main_content, - include_raw_html: cli.raw_html || matches!(cli.format, OutputFormat::Html), - } -} - -/// Normalize a URL: prepend `https://` if no scheme is present. -fn normalize_url(url: &str) -> String { - let trimmed = url.trim(); - if trimmed.contains("://") { - trimmed.to_string() - } else { - format!("https://{trimmed}") - } -} - -/// Derive a filename from a URL for `--output-dir`. -/// -/// Strips the scheme/host, maps the path to a filesystem path, and appends -/// an extension matching the output format. 
-fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String { - let ext = match format { - OutputFormat::Markdown | OutputFormat::Llm => "md", - OutputFormat::Json => "json", - OutputFormat::Text => "txt", - OutputFormat::Html => "html", - }; - - let parsed = url::Url::parse(raw_url); - let (host, path, query) = match &parsed { - Ok(u) => ( - u.host_str().unwrap_or("unknown").to_string(), - u.path().to_string(), - u.query().map(String::from), - ), - Err(_) => (String::new(), String::new(), None), - }; - - // Drop empty / "." / ".." path segments so a URL path like - // `/../../etc/passwd` can't climb out of the output directory. - let cleaned_path: String = path - .split('/') - .filter(|seg| !seg.is_empty() && *seg != "." && *seg != "..") - .collect::>() - .join("/"); - - let mut stem = cleaned_path; - if stem.is_empty() { - // Use hostname for root URLs to avoid collisions in batch mode - let clean_host = host.strip_prefix("www.").unwrap_or(&host); - stem = format!("{}/index", clean_host.replace('.', "_")); - } - - // Append query params so /p?id=123 doesn't collide with /p?id=456 - if let Some(q) = query { - stem = format!("{stem}_{q}"); - } - - // Sanitize: keep alphanumeric, dash, underscore, dot, slash - let sanitized: String = stem - .chars() - .map(|c| { - if c.is_alphanumeric() || matches!(c, '-' | '_' | '.' | '/') { - c - } else { - '_' - } - }) - .collect(); - - format!("{sanitized}.{ext}") -} - -/// Reject a caller-supplied (CSV `url,filename`) name that could escape the -/// output directory: absolute paths, drive prefixes, root, or any `..` -/// component. Returns the validated relative path on success. -fn safe_relative_filename(filename: &str) -> Result { - let candidate = Path::new(filename); - use std::path::Component; - for comp in candidate.components() { - match comp { - Component::Normal(_) | Component::CurDir => {} - Component::ParentDir => { - return Err(format!("refusing path with '..' 
component: {filename}")); - } - Component::RootDir | Component::Prefix(_) => { - return Err(format!("refusing absolute output path: {filename}")); - } - } - } - if candidate.as_os_str().is_empty() { - return Err("empty output filename".to_string()); - } - Ok(candidate.to_path_buf()) -} - -/// Write extraction output to a file inside `dir`, creating parent dirs as needed. -/// -/// `filename` may originate from an attacker-controlled `--urls-file` -/// (`url,filename` CSV). It is validated for traversal, and the canonical -/// destination directory is asserted to stay under the canonical output -/// directory before any write. -fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> { - let rel = safe_relative_filename(filename)?; - let dest = dir.join(&rel); - - std::fs::create_dir_all(dir) - .map_err(|e| format!("failed to create directory {}: {e}", dir.display()))?; - let base = dir - .canonicalize() - .map_err(|e| format!("failed to resolve output dir {}: {e}", dir.display()))?; - - if let Some(parent) = dest.parent() { - std::fs::create_dir_all(parent) - .map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?; - let canon_parent = parent - .canonicalize() - .map_err(|e| format!("failed to resolve {}: {e}", parent.display()))?; - if !canon_parent.starts_with(&base) { - return Err(format!( - "refusing to write outside output dir: {}", - dest.display() - )); - } - } - - std::fs::write(&dest, content) - .map_err(|e| format!("failed to write {}: {e}", dest.display()))?; - let word_count = content.split_whitespace().count(); - eprintln!("Saved: {} ({word_count} words)", dest.display()); - Ok(()) -} - -/// Get raw HTML from an extraction result, falling back to markdown if unavailable. -fn raw_html_or_markdown(result: &ExtractionResult) -> &str { - result - .content - .raw_html - .as_deref() - .unwrap_or(&result.content.markdown) -} - -/// Format an `ExtractionResult` into a string for the given output format. 
-fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String { - match format { - OutputFormat::Markdown => { - let mut out = String::new(); - if show_metadata { - out.push_str(&format_frontmatter(&result.metadata)); - } - out.push_str(&result.content.markdown); - if !result.structured_data.is_empty() { - out.push_str("\n\n## Structured Data\n\n```json\n"); - out.push_str( - &serde_json::to_string_pretty(&result.structured_data).unwrap_or_default(), - ); - out.push_str("\n```"); - } - out - } - OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"), - OutputFormat::Text => result.content.plain_text.clone(), - OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()), - OutputFormat::Html => raw_html_or_markdown(result).to_string(), - } -} - -/// Collect all URLs from positional args + --urls-file, normalizing bare domains. -/// -/// Returns `(url, optional_custom_filename)` pairs. Custom filenames come from -/// CSV-style lines in `--urls-file`: `url,filename`. Plain lines (no comma) get -/// `None` so the caller auto-generates the filename from the URL. 
-fn collect_urls(cli: &Cli) -> Result)>, String> { - let mut entries: Vec<(String, Option)> = - cli.urls.iter().map(|u| (normalize_url(u), None)).collect(); - - if let Some(ref path) = cli.urls_file { - let content = - std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?; - for line in content.lines() { - let trimmed = line.trim(); - if trimmed.is_empty() || trimmed.starts_with('#') { - continue; - } - if let Some((url_part, name_part)) = trimmed.split_once(',') { - let name = name_part.trim(); - let custom = if name.is_empty() { - None - } else { - Some(name.to_string()) - }; - entries.push((normalize_url(url_part.trim()), custom)); - } else { - entries.push((normalize_url(trimmed), None)); - } - } - } - - Ok(entries) -} - -/// Result that can be either a local extraction or a cloud API JSON response. -enum FetchOutput { - Local(Box), - Cloud(serde_json::Value), -} - -impl FetchOutput { - /// Get the local ExtractionResult, or try to parse it from the cloud response. - fn into_extraction(self) -> Result { - match self { - FetchOutput::Local(r) => Ok(*r), - FetchOutput::Cloud(resp) => { - // Cloud response has an "extraction" field with the full ExtractionResult - resp.get("extraction") - .and_then(|v| serde_json::from_value(v.clone()).ok()) - .or_else(|| serde_json::from_value(resp.clone()).ok()) - .ok_or_else(|| "could not parse extraction from cloud response".to_string()) - } - } - } -} - -/// Fetch a URL and extract content, handling PDF detection automatically. -/// Falls back to cloud API when bot protection or JS rendering is detected. 
-async fn fetch_and_extract(cli: &Cli) -> Result { - // Local sources: read and extract as HTML - if cli.stdin { - let mut buf = String::new(); - io::stdin() - .read_to_string(&mut buf) - .map_err(|e| format!("failed to read stdin: {e}"))?; - let options = build_extraction_options(cli); - return extract_with_options(&buf, None, &options) - .map(|r| FetchOutput::Local(Box::new(r))) - .map_err(|e| format!("extraction error: {e}")); - } - - if let Some(ref path) = cli.file { - let html = - std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?; - let options = build_extraction_options(cli); - return extract_with_options(&html, None, &options) - .map(|r| FetchOutput::Local(Box::new(r))) - .map_err(|e| format!("extraction error: {e}")); - } - - let raw_url = cli - .urls - .first() - .ok_or("no input provided -- pass a URL, --file, or --stdin")?; - let url = normalize_url(raw_url); - let url = url.as_str(); - - let cloud_client = webclaw_fetch::cloud::CloudClient::new(cli.api_key.as_deref()); - - // --cloud: skip local, go straight to cloud API - if cli.cloud { - let c = - cloud_client.ok_or("--cloud requires WEBCLAW_API_KEY (set via env or --api-key)")?; - let options = build_extraction_options(cli); - let format_str = match cli.format { - OutputFormat::Markdown => "markdown", - OutputFormat::Json => "json", - OutputFormat::Text => "text", - OutputFormat::Llm => "llm", - OutputFormat::Html => "html", - }; - let resp = c - .scrape( - url, - &[format_str], - &options.include_selectors, - &options.exclude_selectors, - options.only_main_content, - ) - .await?; - return Ok(FetchOutput::Cloud(resp)); - } - - // Normal path: try local first - let client = - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; - let options = build_extraction_options(cli); - let result = client - .fetch_and_extract_with_options(url, &options) - .await - .map_err(|e| format!("fetch error: {e}"))?; - - // Check if we should fall back to 
cloud - let reason = detect_empty(&result); - if !matches!(reason, EmptyReason::None) { - if let Some(ref c) = cloud_client { - eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API..."); - let format_str = match cli.format { - OutputFormat::Markdown => "markdown", - OutputFormat::Json => "json", - OutputFormat::Text => "text", - OutputFormat::Llm => "llm", - OutputFormat::Html => "html", - }; - match c - .scrape( - url, - &[format_str], - &options.include_selectors, - &options.exclude_selectors, - options.only_main_content, - ) - .await - { - Ok(resp) => return Ok(FetchOutput::Cloud(resp)), - Err(e) => { - eprintln!("\x1b[33mwarning:\x1b[0m cloud fallback failed: {e}"); - // Fall through to return the local result with a warning - } - } - } - warn_empty(url, &reason); - } - - Ok(FetchOutput::Local(Box::new(result))) -} - -/// Fetch raw HTML from a URL (no extraction). Used for --raw-html and brand extraction. -async fn fetch_html(cli: &Cli) -> Result { - if cli.stdin { - let mut buf = String::new(); - io::stdin() - .read_to_string(&mut buf) - .map_err(|e| format!("failed to read stdin: {e}"))?; - return Ok(FetchResult { - html: buf, - url: String::new(), - status: 200, - headers: Default::default(), - elapsed: Default::default(), - }); - } - - if let Some(ref path) = cli.file { - let html = - std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?; - return Ok(FetchResult { - html, - url: String::new(), - status: 200, - headers: Default::default(), - elapsed: Default::default(), - }); - } - - let raw_url = cli - .urls - .first() - .ok_or("no input provided -- pass a URL, --file, or --stdin")?; - let url = normalize_url(raw_url); - - let client = - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; - client - .fetch(&url) - .await - .map_err(|e| format!("fetch error: {e}")) -} - -/// Fetch external stylesheets referenced in HTML and inject them as `\n"); - } - } - - if extra_css.is_empty() { - return 
html.to_string(); - } - - if let Some(pos) = html.to_lowercase().find("") { - let mut enriched = String::with_capacity(html.len() + extra_css.len()); - enriched.push_str(&html[..pos]); - enriched.push_str(&extra_css); - enriched.push_str(&html[pos..]); - enriched - } else { - format!("{extra_css}{html}") - } -} - -fn format_frontmatter(meta: &Metadata) -> String { - let mut lines = vec!["---".to_string()]; - - if let Some(title) = &meta.title { - lines.push(format!("title: \"{title}\"")); - } - if let Some(author) = &meta.author { - lines.push(format!("author: \"{author}\"")); - } - if let Some(date) = &meta.published_date { - lines.push(format!("date: \"{date}\"")); - } - if let Some(url) = &meta.url { - lines.push(format!("source: \"{url}\"")); - } - if meta.word_count > 0 { - lines.push(format!("word_count: {}", meta.word_count)); - } - - lines.push("---".to_string()); - lines.push(String::new()); // blank line after frontmatter - lines.join("\n") -} - -fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) { - match format { - OutputFormat::Markdown => { - if show_metadata { - print!("{}", format_frontmatter(&result.metadata)); - } - println!("{}", result.content.markdown); - if !result.structured_data.is_empty() { - println!( - "\n## Structured Data\n\n```json\n{}\n```", - serde_json::to_string_pretty(&result.structured_data).unwrap_or_default() - ); - } - } - OutputFormat::Json => { - // serde_json::to_string_pretty won't fail on our types - println!( - "{}", - serde_json::to_string_pretty(result).expect("serialization failed") - ); - } - OutputFormat::Text => { - println!("{}", result.content.plain_text); - } - OutputFormat::Llm => { - println!("{}", to_llm_text(result, result.metadata.url.as_deref())); - } - OutputFormat::Html => { - println!("{}", raw_html_or_markdown(result)); - } - } -} - -/// Print cloud API response in the requested format. 
-fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) { - match format { - OutputFormat::Json => { - println!( - "{}", - serde_json::to_string_pretty(resp).expect("serialization failed") - ); - } - OutputFormat::Markdown => { - // Cloud response has content.markdown - if let Some(md) = resp - .get("content") - .and_then(|c| c.get("markdown")) - .and_then(|m| m.as_str()) - { - println!("{md}"); - } else if let Some(md) = resp.get("markdown").and_then(|m| m.as_str()) { - println!("{md}"); - } else { - println!( - "{}", - serde_json::to_string_pretty(resp).expect("serialization failed") - ); - } - } - OutputFormat::Text => { - if let Some(txt) = resp - .get("content") - .and_then(|c| c.get("plain_text")) - .and_then(|t| t.as_str()) - { - println!("{txt}"); - } else { - // Fallback to markdown or raw JSON - print_cloud_output(resp, &OutputFormat::Markdown); - } - } - OutputFormat::Llm => { - if let Some(llm) = resp - .get("content") - .and_then(|c| c.get("llm_text")) - .and_then(|t| t.as_str()) - { - println!("{llm}"); - } else { - print_cloud_output(resp, &OutputFormat::Markdown); - } - } - OutputFormat::Html => { - if let Some(html) = resp - .get("content") - .and_then(|c| c.get("raw_html")) - .and_then(|h| h.as_str()) - { - println!("{html}"); - } else { - print_cloud_output(resp, &OutputFormat::Markdown); - } - } - } -} - -fn print_diff_output(diff: &ContentDiff, format: &OutputFormat) { - match format { - OutputFormat::Json => { - println!( - "{}", - serde_json::to_string_pretty(diff).expect("serialization failed") - ); - } - // For markdown/text/llm, show a human-readable summary - _ => { - println!("Status: {:?}", diff.status); - println!("Word count delta: {:+}", diff.word_count_delta); - - if !diff.metadata_changes.is_empty() { - println!("\nMetadata changes:"); - for change in &diff.metadata_changes { - println!( - " {}: {} -> {}", - change.field, - change.old.as_deref().unwrap_or("(none)"), - change.new.as_deref().unwrap_or("(none)"), - ); 
- } - } - - if !diff.links_added.is_empty() { - println!("\nLinks added:"); - for link in &diff.links_added { - println!(" + {} ({})", link.href, link.text); - } - } - - if !diff.links_removed.is_empty() { - println!("\nLinks removed:"); - for link in &diff.links_removed { - println!(" - {} ({})", link.href, link.text); - } - } - - if let Some(ref text_diff) = diff.text_diff { - println!("\n{text_diff}"); - } - } - } -} - -fn print_crawl_output(result: &CrawlResult, format: &OutputFormat, show_metadata: bool) { - match format { - OutputFormat::Json => { - println!( - "{}", - serde_json::to_string_pretty(result).expect("serialization failed") - ); - } - OutputFormat::Markdown => { - for page in &result.pages { - let Some(ref extraction) = page.extraction else { - continue; - }; - println!("---"); - println!("# Page: {}\n", page.url); - if show_metadata { - print!("{}", format_frontmatter(&extraction.metadata)); - } - println!("{}", extraction.content.markdown); - println!(); - } - } - OutputFormat::Text => { - for page in &result.pages { - let Some(ref extraction) = page.extraction else { - continue; - }; - println!("---"); - println!("# Page: {}\n", page.url); - println!("{}", extraction.content.plain_text); - println!(); - } - } - OutputFormat::Llm => { - for page in &result.pages { - let Some(ref extraction) = page.extraction else { - continue; - }; - println!("---"); - println!("{}", to_llm_text(extraction, Some(page.url.as_str()))); - println!(); - } - } - OutputFormat::Html => { - for page in &result.pages { - let Some(ref extraction) = page.extraction else { - continue; - }; - println!("---"); - println!("\n", page.url); - println!("{}", raw_html_or_markdown(extraction)); - println!(); - } - } - } -} - -fn print_batch_output(results: &[BatchExtractResult], format: &OutputFormat, show_metadata: bool) { - match format { - OutputFormat::Json => { - // Build a JSON array of {url, result?, error?} objects - let entries: Vec = results - .iter() - .map(|r| match 
&r.result { - Ok(extraction) => serde_json::json!({ - "url": r.url, - "result": extraction, - }), - Err(e) => serde_json::json!({ - "url": r.url, - "error": e.to_string(), - }), - }) - .collect(); - println!( - "{}", - serde_json::to_string_pretty(&entries).expect("serialization failed") - ); - } - OutputFormat::Markdown => { - for r in results { - match &r.result { - Ok(extraction) => { - println!("---"); - println!("# {}\n", r.url); - if show_metadata { - print!("{}", format_frontmatter(&extraction.metadata)); - } - println!("{}", extraction.content.markdown); - println!(); - } - Err(e) => { - eprintln!("error: {} -- {}", r.url, e); - } - } - } - } - OutputFormat::Text => { - for r in results { - match &r.result { - Ok(extraction) => { - println!("---"); - println!("# {}\n", r.url); - println!("{}", extraction.content.plain_text); - println!(); - } - Err(e) => { - eprintln!("error: {} -- {}", r.url, e); - } - } - } - } - OutputFormat::Llm => { - for r in results { - match &r.result { - Ok(extraction) => { - println!("---"); - println!("{}", to_llm_text(extraction, Some(r.url.as_str()))); - println!(); - } - Err(e) => { - eprintln!("error: {} -- {}", r.url, e); - } - } - } - } - OutputFormat::Html => { - for r in results { - match &r.result { - Ok(extraction) => { - println!("---"); - println!("\n", r.url); - println!("{}", raw_html_or_markdown(extraction)); - println!(); - } - Err(e) => { - eprintln!("error: {} -- {}", r.url, e); - } - } - } - } - } -} - -fn print_map_output(entries: &[SitemapEntry], format: &OutputFormat) { - match format { - OutputFormat::Json => { - println!( - "{}", - serde_json::to_string_pretty(entries).expect("serialization failed") - ); - } - _ => { - for entry in entries { - println!("{}", entry.url); - } - } - } -} - -/// Format a streaming progress line for a completed page. 
-fn format_progress(page: &PageResult, index: usize, max_pages: usize) -> String { - let status = if page.error.is_some() { "ERR" } else { "OK " }; - let timing = format!("{}ms", page.elapsed.as_millis()); - let detail = if let Some(ref extraction) = page.extraction { - format!(", {} words", extraction.metadata.word_count) - } else if let Some(ref err) = page.error { - format!(" ({err})") - } else { - String::new() - }; - format!( - "[{index}/{max_pages}] {status} {} ({timing}{detail})", - page.url - ) -} - -async fn run_crawl(cli: &Cli) -> Result<(), String> { - let url = cli - .urls - .first() - .ok_or("--crawl requires a URL argument") - .map(|u| normalize_url(u))?; - let url = url.as_str(); - - if cli.file.is_some() || cli.stdin { - return Err("--crawl cannot be used with --file or --stdin".into()); - } - - let include_patterns: Vec = cli - .include_paths - .as_deref() - .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) - .unwrap_or_default(); - let exclude_patterns: Vec = cli - .exclude_paths - .as_deref() - .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) - .unwrap_or_default(); - - // Set up streaming progress channel - let (progress_tx, mut progress_rx) = tokio::sync::broadcast::channel::(100); - - // Set up cancel flag for Ctrl+C handling - let cancel_flag = Arc::new(AtomicBool::new(false)); - - // Register Ctrl+C handler when --crawl-state is set - let state_path = cli.crawl_state.clone(); - if state_path.is_some() { - let flag = Arc::clone(&cancel_flag); - tokio::spawn(async move { - tokio::signal::ctrl_c().await.ok(); - flag.store(true, Ordering::Relaxed); - eprintln!("\nCtrl+C received, saving crawl state..."); - }); - } - - let config = CrawlConfig { - fetch: build_fetch_config(cli), - max_depth: cli.depth, - max_pages: cli.max_pages, - concurrency: cli.concurrency, - delay: std::time::Duration::from_millis(cli.delay), - path_prefix: cli.path_prefix.clone(), - use_sitemap: cli.sitemap, - include_patterns, - exclude_patterns, 
- progress_tx: Some(progress_tx), - cancel_flag: Some(Arc::clone(&cancel_flag)), - allow_subdomains: false, - allow_external_links: false, - }; - - // Load resume state if --crawl-state file exists - let resume_state = state_path - .as_ref() - .and_then(|p| Crawler::load_state(p)) - .inspect(|s| { - eprintln!( - "Resuming crawl: {} pages already visited, {} URLs in frontier", - s.visited.len(), - s.frontier.len(), - ); - }); - - let max_pages = cli.max_pages; - let completed_offset = resume_state.as_ref().map_or(0, |s| s.completed_pages); - - // Spawn background task to print streaming progress to stderr - let progress_handle = tokio::spawn(async move { - let mut count = completed_offset; - while let Ok(page) = progress_rx.recv().await { - count += 1; - eprintln!("{}", format_progress(&page, count, max_pages)); - } - }); - - let crawler = Crawler::new(url, config).map_err(|e| format!("crawler error: {e}"))?; - let result = crawler.crawl(url, resume_state).await; - - // Drop the crawler (and its progress_tx clone) so the progress task finishes - drop(crawler); - let _ = progress_handle.await; - - // If cancelled via Ctrl+C and --crawl-state is set, save state for resume - let was_cancelled = cancel_flag.load(Ordering::Relaxed); - if was_cancelled { - if let Some(ref path) = state_path { - Crawler::save_state( - path, - url, - &result.visited, - &result.remaining_frontier, - completed_offset + result.pages.len(), - cli.max_pages, - cli.depth, - )?; - eprintln!( - "Crawl state saved to {} ({} pages completed). 
Resume with --crawl-state {}", - path.display(), - completed_offset + result.pages.len(), - path.display(), - ); - } - } else if let Some(ref path) = state_path { - // Crawl completed normally — clean up state file - if path.exists() { - let _ = std::fs::remove_file(path); - } - } - - // Log per-page errors and extraction warnings to stderr - for page in &result.pages { - if let Some(ref err) = page.error { - eprintln!("error: {} -- {}", page.url, err); - } else if let Some(ref extraction) = page.extraction { - let reason = detect_empty(extraction); - if !matches!(reason, EmptyReason::None) { - warn_empty(&page.url, &reason); - } - } - } - - if let Some(ref dir) = cli.output_dir { - let mut saved = 0usize; - for page in &result.pages { - if let Some(ref extraction) = page.extraction { - let filename = url_to_filename(&page.url, &cli.format); - let content = format_output(extraction, &cli.format, cli.metadata); - write_to_file(dir, &filename, &content)?; - saved += 1; - } - } - eprintln!("Saved {saved} files to {}", dir.display()); - } else { - print_crawl_output(&result, &cli.format, cli.metadata); - } - - eprintln!( - "Crawled {} pages ({} ok, {} errors) in {:.1}s", - result.total, result.ok, result.errors, result.elapsed_secs, - ); - - // Fire webhook on crawl complete - if let Some(ref webhook_url) = cli.webhook { - let urls: Vec<&str> = result.pages.iter().map(|p| p.url.as_str()).collect(); - fire_webhook( - webhook_url, - &serde_json::json!({ - "event": "crawl_complete", - "total": result.total, - "ok": result.ok, - "errors": result.errors, - "elapsed_secs": result.elapsed_secs, - "urls": urls, - }), - ); - // Brief pause so the async webhook has time to fire - tokio::time::sleep(std::time::Duration::from_millis(500)).await; - } - - if result.errors > 0 { - Err(format!( - "{} of {} pages failed", - result.errors, result.total - )) - } else { - Ok(()) - } -} - -async fn run_map(cli: &Cli) -> Result<(), String> { - let url = cli - .urls - .first() - 
.ok_or("--map requires a URL argument") - .map(|u| normalize_url(u))?; - let url = url.as_str(); - - let client = - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; - - let entries = webclaw_fetch::sitemap::discover(&client, url) - .await - .map_err(|e| format!("sitemap discovery failed: {e}"))?; - - if entries.is_empty() { - eprintln!("no sitemap URLs found for {url}"); - } else { - eprintln!("discovered {} URLs", entries.len()); - } - - print_map_output(&entries, &cli.format); - Ok(()) -} - -async fn run_batch(cli: &Cli, entries: &[(String, Option)]) -> Result<(), String> { - let client = Arc::new( - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?, - ); - - let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect(); - let options = build_extraction_options(cli); - let results = client - .fetch_and_extract_batch_with_options(&urls, cli.concurrency, &options) - .await; - - let ok = results.iter().filter(|r| r.result.is_ok()).count(); - let errors = results.len() - ok; - - // Log errors and extraction warnings to stderr - for r in &results { - if let Err(ref e) = r.result { - eprintln!("error: {} -- {}", r.url, e); - } else if let Ok(ref extraction) = r.result { - let reason = detect_empty(extraction); - if !matches!(reason, EmptyReason::None) { - warn_empty(&r.url, &reason); - } - } - } - - // Build a lookup of custom filenames by URL - let custom_names: std::collections::HashMap<&str, &str> = entries - .iter() - .filter_map(|(url, name)| name.as_deref().map(|n| (url.as_str(), n))) - .collect(); - - if let Some(ref dir) = cli.output_dir { - let mut saved = 0usize; - for r in &results { - if let Ok(ref extraction) = r.result { - let filename = custom_names - .get(r.url.as_str()) - .map(|s| s.to_string()) - .unwrap_or_else(|| url_to_filename(&r.url, &cli.format)); - let content = format_output(extraction, &cli.format, cli.metadata); - write_to_file(dir, &filename, &content)?; - 
saved += 1; - } - } - eprintln!("Saved {saved} files to {}", dir.display()); - } else { - print_batch_output(&results, &cli.format, cli.metadata); - } - - eprintln!( - "Fetched {} URLs ({} ok, {} errors)", - results.len(), - ok, - errors - ); - - // Fire webhook on batch complete - if let Some(ref webhook_url) = cli.webhook { - let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect(); - fire_webhook( - webhook_url, - &serde_json::json!({ - "event": "batch_complete", - "total": results.len(), - "ok": ok, - "errors": errors, - "urls": urls, - }), - ); - tokio::time::sleep(std::time::Duration::from_millis(500)).await; - } - - if errors > 0 { - Err(format!("{errors} of {} URLs failed", results.len())) - } else { - Ok(()) - } -} - -fn timestamp() -> String { - let now = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_secs(); - let hours = (now % 86400) / 3600; - let minutes = (now % 3600) / 60; - let seconds = now % 60; - format!("{hours:02}:{minutes:02}:{seconds:02}") -} - -/// Spawn the `--on-change` command with `payload` on stdin. -/// -/// Previously this passed the entire user-provided string to `sh -c`, which -/// made `--on-change 'notify "$URL"; rm -rf /'` a plausible disaster the -/// moment an untrusted config file or MCP-driven agent fed us a command. -/// The MCP surface specifically is prompt-injection-exposed: an LLM that -/// controls CLI args can escalate into arbitrary shell on the host. -/// -/// We now parse the command with `shlex` (POSIX-ish tokenization with proper -/// quoting) and exec the program directly without an intermediate shell, so -/// metacharacters like `;`, `&&`, `|`, `$()`, and env expansion can't fire. -/// Users who genuinely need a pipeline can set the whole chain behind a -/// script they've written, or opt in per-call via `WEBCLAW_ALLOW_SHELL=1` -/// (documented escape hatch, noisy by design). 
-async fn spawn_on_change(cmd: &str, stdin_payload: &[u8]) { - eprintln!("[watch] Running: {cmd}"); - - let allow_shell = std::env::var("WEBCLAW_ALLOW_SHELL") - .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) - .unwrap_or(false); - - let mut command = if allow_shell { - eprintln!("[watch] WEBCLAW_ALLOW_SHELL=1 — executing via sh -c (unsafe)"); - let mut c = tokio::process::Command::new("sh"); - c.arg("-c").arg(cmd); - c - } else { - let Some(argv) = shlex::split(cmd) else { - eprintln!("[watch] Failed to parse --on-change command (unbalanced quotes?)"); - return; - }; - let Some((program, args)) = argv.split_first() else { - eprintln!("[watch] --on-change command is empty"); - return; - }; - let mut c = tokio::process::Command::new(program); - c.args(args); - c - }; - - command.stdin(std::process::Stdio::piped()); - - match command.spawn() { - Ok(mut child) => { - if let Some(mut stdin) = child.stdin.take() { - use tokio::io::AsyncWriteExt; - let _ = stdin.write_all(stdin_payload).await; - } - } - Err(e) => eprintln!("[watch] Failed to run command: {e}"), - } -} - -/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr. -/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly. 
-fn fire_webhook(url: &str, payload: &serde_json::Value) { - let url = url.to_string(); - let is_discord = url.contains("discord.com/api/webhooks"); - let is_slack = url.contains("hooks.slack.com"); - - let body = if is_discord { - let event = payload - .get("event") - .and_then(|v| v.as_str()) - .unwrap_or("notification"); - let details = serde_json::to_string_pretty(payload).unwrap_or_default(); - serde_json::json!({ - "embeds": [{ - "title": format!("webclaw: {event}"), - "description": format!("```json\n{details}\n```"), - "color": 5814783 - }] - }) - .to_string() - } else if is_slack { - let event = payload - .get("event") - .and_then(|v| v.as_str()) - .unwrap_or("notification"); - let details = serde_json::to_string_pretty(payload).unwrap_or_default(); - serde_json::json!({ - "text": format!("*webclaw: {event}*\n```{details}```") - }) - .to_string() - } else { - serde_json::to_string(payload).unwrap_or_default() - }; - tokio::spawn(async move { - // SSRF guard: a webhook URL is user-supplied and otherwise bypasses - // the fetch-layer protections, so resolve + reject private/internal - // destinations before sending the payload. 
- if let Err(e) = webclaw_fetch::url_security::validate_public_http_url(&url).await { - eprintln!("[webhook] refusing unsafe URL: {e}"); - return; - } - match reqwest::Client::builder() - .timeout(std::time::Duration::from_secs(10)) - .build() - { - Ok(c) => match c - .post(&url) - .header("Content-Type", "application/json") - .body(body) - .send() - .await - { - Ok(resp) => { - eprintln!( - "[webhook] POST {} -> {}", - &url[..url.len().min(60)], - resp.status() - ); - } - Err(e) => eprintln!("[webhook] POST failed: {e}"), - }, - Err(e) => eprintln!("[webhook] client error: {e}"), - } - }); -} - -async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> { - if urls.is_empty() { - return Err("--watch requires at least one URL".into()); - } - - let client = Arc::new( - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?, - ); - let options = build_extraction_options(cli); - - // Ctrl+C handler - let cancelled = Arc::new(AtomicBool::new(false)); - let flag = Arc::clone(&cancelled); - tokio::spawn(async move { - tokio::signal::ctrl_c().await.ok(); - flag.store(true, Ordering::Relaxed); - }); - - // Single-URL mode: preserve original behavior exactly - if urls.len() == 1 { - return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await; - } - - // Multi-URL mode: batch fetch, diff each, report aggregate - run_watch_multi(cli, &client, &options, urls, &cancelled).await -} - -/// Original single-URL watch loop -- backward compatible. 
-async fn run_watch_single( - cli: &Cli, - client: &Arc, - options: &ExtractionOptions, - url: &str, - cancelled: &Arc, -) -> Result<(), String> { - let mut previous = client - .fetch_and_extract_with_options(url, options) - .await - .map_err(|e| format!("initial fetch failed: {e}"))?; - - eprintln!( - "[watch] Initial snapshot: {url} ({} words)", - previous.metadata.word_count - ); - - loop { - // Clamp to >=1s: `--watch-interval 0` would otherwise spin the - // fetch loop with zero delay and hammer the target. - tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await; - - if cancelled.load(Ordering::Relaxed) { - eprintln!("[watch] Stopped"); - break; - } - - let current = match client.fetch_and_extract_with_options(url, options).await { - Ok(result) => result, - Err(e) => { - eprintln!("[watch] Fetch error ({}): {e}", timestamp()); - continue; - } - }; - - let diff = webclaw_core::diff::diff(&previous, ¤t); - - if diff.status == ChangeStatus::Same { - eprintln!("[watch] No changes ({})", timestamp()); - } else { - print_diff_output(&diff, &cli.format); - eprintln!("[watch] Changes detected! ({})", timestamp()); - - if let Some(ref cmd) = cli.on_change { - let diff_json = serde_json::to_string(&diff).unwrap_or_default(); - spawn_on_change(cmd, diff_json.as_bytes()).await; - } - - if let Some(ref webhook_url) = cli.webhook { - fire_webhook( - webhook_url, - &serde_json::json!({ - "event": "watch_change", - "url": url, - "status": format!("{:?}", diff.status), - "word_count_delta": diff.word_count_delta, - "metadata_changes": diff.metadata_changes.len(), - "links_added": diff.links_added.len(), - "links_removed": diff.links_removed.len(), - }), - ); - } - - previous = current; - } - } - - Ok(()) -} - -/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate. 
-async fn run_watch_multi( - cli: &Cli, - client: &Arc, - options: &ExtractionOptions, - urls: &[String], - cancelled: &Arc, -) -> Result<(), String> { - let url_refs: Vec<&str> = urls.iter().map(|u| u.as_str()).collect(); - - // Initial pass: fetch all URLs in parallel - let initial_results = client - .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options) - .await; - - let mut snapshots = std::collections::HashMap::new(); - let mut ok_count = 0usize; - let mut err_count = 0usize; - - for r in initial_results { - match r.result { - Ok(extraction) => { - snapshots.insert(r.url, extraction); - ok_count += 1; - } - Err(e) => { - eprintln!("[watch] Initial fetch error: {} -- {e}", r.url); - err_count += 1; - } - } - } - - eprintln!( - "[watch] Watching {} URLs (interval: {}s)", - urls.len(), - cli.watch_interval - ); - eprintln!("[watch] Initial snapshots: {ok_count} ok, {err_count} errors"); - - let mut check_number = 0u64; - - loop { - // Clamp to >=1s: `--watch-interval 0` would otherwise spin the - // fetch loop with zero delay and hammer the target. 
- tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await; - - if cancelled.load(Ordering::Relaxed) { - eprintln!("[watch] Stopped"); - break; - } - - check_number += 1; - - let current_results = client - .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options) - .await; - - let mut changed: Vec = Vec::new(); - let mut same_count = 0usize; - let mut fetch_errors = 0usize; - - for r in current_results { - match r.result { - Ok(current) => { - if let Some(previous) = snapshots.get(&r.url) { - let diff = webclaw_core::diff::diff(previous, ¤t); - if diff.status == ChangeStatus::Same { - same_count += 1; - } else { - changed.push(serde_json::json!({ - "url": r.url, - "word_count_delta": diff.word_count_delta, - })); - snapshots.insert(r.url, current); - } - } else { - // URL failed initially, first successful fetch -- store as baseline - snapshots.insert(r.url, current); - same_count += 1; - } - } - Err(e) => { - eprintln!("[watch] Fetch error: {} -- {e}", r.url); - fetch_errors += 1; - } - } - } - - let ts = timestamp(); - let err_suffix = if fetch_errors > 0 { - format!(", {fetch_errors} errors") - } else { - String::new() - }; - - if changed.is_empty() { - eprintln!( - "[watch] Check {check_number} ({ts}): 0 changed, {same_count} same{err_suffix}" - ); - } else { - eprintln!( - "[watch] Check {check_number} ({ts}): {} changed, {same_count} same{err_suffix}", - changed.len(), - ); - for entry in &changed { - let url = entry["url"].as_str().unwrap_or("?"); - let delta = entry["word_count_delta"].as_i64().unwrap_or(0); - eprintln!(" -> {url} (word delta: {delta:+})"); - } - - // Fire --on-change once with all changes - if let Some(ref cmd) = cli.on_change { - let payload = serde_json::json!({ - "event": "watch_changes", - "check_number": check_number, - "total_urls": urls.len(), - "changed": changed.len(), - "same": same_count, - "changes": changed, - }); - let payload_json = 
serde_json::to_string(&payload).unwrap_or_default(); - spawn_on_change(cmd, payload_json.as_bytes()).await; - } - - // Fire webhook once with aggregate payload - if let Some(ref webhook_url) = cli.webhook { - fire_webhook( - webhook_url, - &serde_json::json!({ - "event": "watch_changes", - "check_number": check_number, - "total_urls": urls.len(), - "changed": changed.len(), - "same": same_count, - "changes": changed, - }), - ); - } - } - } - - Ok(()) -} - -async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> { - // Load previous snapshot - let snapshot_json = std::fs::read_to_string(snapshot_path) - .map_err(|e| format!("failed to read snapshot {snapshot_path}: {e}"))?; - let old: ExtractionResult = serde_json::from_str(&snapshot_json) - .map_err(|e| format!("failed to parse snapshot JSON: {e}"))?; - - // Extract current version (handles PDF detection for URLs) - let new_result = fetch_and_extract(cli).await?.into_extraction()?; - - let diff = webclaw_core::diff::diff(&old, &new_result); - print_diff_output(&diff, &cli.format); - - Ok(()) -} - -async fn run_brand(cli: &Cli) -> Result<(), String> { - let result = fetch_html(cli).await?; - let enriched = enrich_html_with_stylesheets(&result.html, &result.url).await; - let brand = webclaw_core::brand::extract_brand( - &enriched, - Some(result.url.as_str()).filter(|s| !s.is_empty()), - ); - println!( - "{}", - serde_json::to_string_pretty(&brand).expect("serialization failed") - ); - Ok(()) -} - -/// Build an LLM provider based on CLI flags, or fall back to the default chain. 
-async fn build_llm_provider(cli: &Cli) -> Result, String> { - if let Some(ref name) = cli.llm_provider { - match name.as_str() { - "ollama" => { - let provider = webclaw_llm::providers::ollama::OllamaProvider::new( - cli.llm_base_url.clone(), - cli.llm_model.clone(), - ); - if !provider.is_available().await { - return Err("ollama is not running or unreachable".into()); - } - Ok(Box::new(provider)) - } - "openai" => { - let provider = webclaw_llm::providers::openai::OpenAiProvider::new( - None, - cli.llm_base_url.clone(), - cli.llm_model.clone(), - ) - .ok_or("OPENAI_API_KEY not set")?; - Ok(Box::new(provider)) - } - "anthropic" => { - let provider = webclaw_llm::providers::anthropic::AnthropicProvider::with_base_url( - None, - cli.llm_base_url.clone(), - cli.llm_model.clone(), - ) - .ok_or("ANTHROPIC_API_KEY not set")?; - Ok(Box::new(provider)) - } - other => Err(format!( - "unknown LLM provider: {other} (use ollama, openai, or anthropic)" - )), - } - } else { - let chain = webclaw_llm::ProviderChain::default().await; - if chain.is_empty() { - return Err( - "no LLM providers available -- start Ollama or set OPENAI_API_KEY / ANTHROPIC_API_KEY" - .into(), - ); - } - Ok(Box::new(chain)) - } -} - -async fn run_llm(cli: &Cli) -> Result<(), String> { - // Extract content from source first (handles PDF detection for URLs) - let result = fetch_and_extract(cli).await?.into_extraction()?; - - let provider = build_llm_provider(cli).await?; - let model = cli.llm_model.as_deref(); - - if let Some(ref schema_input) = cli.extract_json { - // Support @file syntax for loading schema from file - let schema_str = if let Some(path) = schema_input.strip_prefix('@') { - std::fs::read_to_string(path) - .map_err(|e| format!("failed to read schema file {path}: {e}"))? 
- } else { - schema_input.clone() - }; - - let schema: serde_json::Value = - serde_json::from_str(&schema_str).map_err(|e| format!("invalid JSON schema: {e}"))?; - - let extracted = webclaw_llm::extract::extract_json( - &result.content.plain_text, - &schema, - provider.as_ref(), - model, - ) - .await - .map_err(|e| format!("LLM extraction failed: {e}"))?; - - println!( - "{}", - serde_json::to_string_pretty(&extracted).expect("serialization failed") - ); - } else if let Some(ref prompt) = cli.extract_prompt { - let extracted = webclaw_llm::extract::extract_with_prompt( - &result.content.plain_text, - prompt, - provider.as_ref(), - model, - ) - .await - .map_err(|e| format!("LLM extraction failed: {e}"))?; - - println!( - "{}", - serde_json::to_string_pretty(&extracted).expect("serialization failed") - ); - } else if let Some(sentences) = cli.summarize { - let summary = webclaw_llm::summarize::summarize( - &result.content.plain_text, - Some(sentences), - provider.as_ref(), - model, - ) - .await - .map_err(|e| format!("LLM summarization failed: {e}"))?; - - println!("{summary}"); - } - - Ok(()) -} - -/// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results. -/// URLs are processed sequentially to respect LLM provider rate limits. -async fn run_batch_llm(cli: &Cli, entries: &[(String, Option)]) -> Result<(), String> { - let client = - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; - let options = build_extraction_options(cli); - let provider = build_llm_provider(cli).await?; - let model = cli.llm_model.as_deref(); - - // Pre-parse schema once if --extract-json is used - let schema = if let Some(ref schema_input) = cli.extract_json { - let schema_str = if let Some(path) = schema_input.strip_prefix('@') { - std::fs::read_to_string(path) - .map_err(|e| format!("failed to read schema file {path}: {e}"))? 
- } else { - schema_input.clone() - }; - Some( - serde_json::from_str::(&schema_str) - .map_err(|e| format!("invalid JSON schema: {e}"))?, - ) - } else { - None - }; - - // Build custom filename lookup from entries - let custom_names: std::collections::HashMap<&str, &str> = entries - .iter() - .filter_map(|(url, name)| name.as_deref().map(|n| (url.as_str(), n))) - .collect(); - - let total = entries.len(); - let mut ok = 0usize; - let mut errors = 0usize; - let mut all_results: Vec = Vec::with_capacity(total); - - for (i, (url, _)) in entries.iter().enumerate() { - let idx = i + 1; - eprint!("[{idx}/{total}] {url} "); - - // Fetch and extract page content - let extraction = match client.fetch_and_extract_with_options(url, &options).await { - Ok(r) => r, - Err(e) => { - errors += 1; - let msg = format!("fetch failed: {e}"); - eprintln!("-> error: {msg}"); - all_results.push(serde_json::json!({ "url": url, "error": msg })); - continue; - } - }; - - let text = &extraction.content.plain_text; - - // Run the appropriate LLM operation - let llm_result = if let Some(ref schema) = schema { - webclaw_llm::extract::extract_json(text, schema, provider.as_ref(), model) - .await - .map(LlmOutput::Json) - } else if let Some(ref prompt) = cli.extract_prompt { - webclaw_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model) - .await - .map(LlmOutput::Json) - } else if let Some(sentences) = cli.summarize { - webclaw_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model) - .await - .map(LlmOutput::Text) - } else { - unreachable!("run_batch_llm called without LLM flags") - }; - - match llm_result { - Ok(output) => { - ok += 1; - - let (output_str, result_json) = match &output { - LlmOutput::Json(v) => { - let s = serde_json::to_string_pretty(v).expect("serialization failed"); - let j = serde_json::json!({ "url": url, "result": v }); - (s, j) - } - LlmOutput::Text(s) => { - let j = serde_json::json!({ "url": url, "result": s }); - (s.clone(), j) 
- } - }; - - // Count top-level fields/items for progress display - let detail = match &output { - LlmOutput::Json(v) => match v { - serde_json::Value::Object(m) => format!("{} fields", m.len()), - serde_json::Value::Array(a) => format!("{} items", a.len()), - _ => "done".to_string(), - }, - LlmOutput::Text(s) => { - let words = s.split_whitespace().count(); - format!("{words} words") - } - }; - eprintln!("-> extracted {detail}"); - - if let Some(ref dir) = cli.output_dir { - let filename = custom_names - .get(url.as_str()) - .map(|s| s.to_string()) - .unwrap_or_else(|| url_to_filename(url, &OutputFormat::Json)); - write_to_file(dir, &filename, &output_str)?; - } else { - println!("--- {url}"); - println!("{output_str}"); - println!(); - } - - all_results.push(result_json); - } - Err(e) => { - errors += 1; - let msg = format!("LLM extraction failed: {e}"); - eprintln!("-> error: {msg}"); - all_results.push(serde_json::json!({ "url": url, "error": msg })); - } - } - } - - eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)"); - - if let Some(ref webhook_url) = cli.webhook { - fire_webhook( - webhook_url, - &serde_json::json!({ - "event": "batch_llm_complete", - "total": total, - "ok": ok, - "errors": errors, - }), - ); - tokio::time::sleep(std::time::Duration::from_millis(500)).await; - } - - if errors > 0 { - Err(format!("{errors} of {total} URLs failed")) - } else { - Ok(()) - } -} - -/// Intermediate type to hold LLM output before formatting. -enum LlmOutput { - Json(serde_json::Value), - Text(String), -} - -/// Returns true if any LLM flag is set. 
-fn has_llm_flags(cli: &Cli) -> bool { - cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some() -} - -async fn run_research(cli: &Cli, query: &str) -> Result<(), String> { - let api_key = cli - .api_key - .as_deref() - .ok_or("--research requires WEBCLAW_API_KEY (set via env or --api-key)")?; - - let client = reqwest::Client::builder() - .timeout(std::time::Duration::from_secs(600)) - .build() - .map_err(|e| format!("http client error: {e}"))?; - - let mut body = serde_json::json!({ "query": query }); - if cli.deep { - body["deep"] = serde_json::json!(true); - } - - eprintln!("Starting research: {query}"); - if cli.deep { - eprintln!("Deep mode enabled (longer, more thorough)"); - } - - // Start job - let resp = client - .post("https://api.webclaw.io/v1/research") - .header("Authorization", format!("Bearer {api_key}")) - .json(&body) - .send() - .await - .map_err(|e| format!("API error: {e}"))? - .json::() - .await - .map_err(|e| format!("parse error: {e}"))?; - - let job_id = resp - .get("id") - .and_then(|v| v.as_str()) - .ok_or("API did not return a job ID")? - .to_string(); - - eprintln!("Job started: {job_id}"); - - // Poll - for poll in 0..200 { - tokio::time::sleep(std::time::Duration::from_secs(3)).await; - - let status_resp = client - .get(format!("https://api.webclaw.io/v1/research/{job_id}")) - .header("Authorization", format!("Bearer {api_key}")) - .send() - .await - .map_err(|e| format!("poll error: {e}"))? 
- .json::() - .await - .map_err(|e| format!("parse error: {e}"))?; - - let status = status_resp - .get("status") - .and_then(|v| v.as_str()) - .unwrap_or("unknown"); - - match status { - "completed" => { - let report = status_resp - .get("report") - .and_then(|v| v.as_str()) - .unwrap_or(""); - - // Save full result to JSON file - let slug: String = query - .chars() - .map(|c| { - if c.is_alphanumeric() || c == ' ' { - c - } else { - ' ' - } - }) - .collect::() - .split_whitespace() - .collect::>() - .join("-") - .to_lowercase(); - // char-safe truncation: byte slicing panics if char 50 - // lands mid-codepoint (multibyte queries). - let slug: String = slug.chars().take(50).collect(); - let filename = format!("research-{slug}.json"); - - let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default(); - std::fs::write(&filename, &json) - .map_err(|e| format!("failed to write {filename}: {e}"))?; - - let elapsed = status_resp - .get("elapsed_ms") - .and_then(|v| v.as_i64()) - .unwrap_or(0); - let sources = status_resp - .get("sources_count") - .and_then(|v| v.as_i64()) - .unwrap_or(0); - let findings = status_resp - .get("findings_count") - .and_then(|v| v.as_i64()) - .unwrap_or(0); - - eprintln!( - "Research complete: {sources} sources, {findings} findings, {:.1}s", - elapsed as f64 / 1000.0 - ); - eprintln!("Saved to: {filename}"); - - // Print report to stdout - if !report.is_empty() { - println!("{report}"); - } - - return Ok(()); - } - "failed" => { - let error = status_resp - .get("error") - .and_then(|v| v.as_str()) - .unwrap_or("unknown error"); - return Err(format!("Research failed: {error}")); - } - _ => { - if poll % 10 == 9 { - eprintln!("Still researching... ({:.0}s)", (poll + 1) as f64 * 3.0); - } - } - } - } - - Err(format!( - "Research timed out after ~10 minutes. 
Check status: GET /v1/research/{job_id}" - )) -} - #[tokio::main] async fn main() { dotenvy::dotenv().ok(); @@ -2680,226 +277,3 @@ async fn main() { } } } - -#[cfg(test)] -mod tests { - use super::*; - use webclaw_core::Content; - - fn empty_result(title: Option<&str>, url: Option<&str>, markdown: &str) -> ExtractionResult { - ExtractionResult { - metadata: Metadata { - title: title.map(str::to_string), - description: None, - author: None, - published_date: None, - language: None, - url: url.map(str::to_string), - site_name: None, - image: None, - favicon: None, - word_count: markdown.split_whitespace().count(), - }, - content: Content { - markdown: markdown.to_string(), - plain_text: markdown.to_string(), - links: vec![], - images: vec![], - code_blocks: vec![], - raw_html: None, - }, - domain_data: None, - structured_data: vec![], - } - } - - #[test] - fn detect_empty_identifies_consent_redirect_url() { - let result = empty_result( - Some("Yahoo"), - Some("https://guce.advertising.com/collectIdentifiers?sessionId=abc"), - "Continue", - ); - assert_eq!(detect_empty(&result), EmptyReason::ConsentWall); - } - - #[test] - fn detect_empty_identifies_short_consent_title() { - let result = empty_result( - Some("Before you continue"), - Some("https://www.google.com/"), - "Review privacy options", - ); - assert_eq!(detect_empty(&result), EmptyReason::ConsentWall); - } - - #[test] - fn detect_empty_does_not_flag_real_content_with_consent_words() { - let result = empty_result( - Some("Cookie consent patterns explained"), - Some("https://example.com/blog"), - "This article explains cookie consent patterns for product teams with enough real body text to be useful. It covers consent banners, privacy controls, analytics configuration, regional requirements, product tradeoffs, implementation details, testing flows, debugging notes, accessibility needs, and operational lessons from real teams shipping public websites across multiple markets. 
It also explains measurement, rollout planning, copy review, support workflows, design constraints, release notes, and how to keep privacy choices understandable for users.", - ); - assert_eq!(detect_empty(&result), EmptyReason::None); - } - - #[test] - fn url_to_filename_root() { - assert_eq!( - url_to_filename("https://example.com/", &OutputFormat::Markdown), - "example_com/index.md" - ); - assert_eq!( - url_to_filename("https://example.com", &OutputFormat::Markdown), - "example_com/index.md" - ); - } - - #[test] - fn url_to_filename_path() { - assert_eq!( - url_to_filename("https://example.com/docs/api", &OutputFormat::Markdown), - "docs/api.md" - ); - } - - #[test] - fn url_to_filename_trailing_slash() { - assert_eq!( - url_to_filename("https://example.com/docs/api/", &OutputFormat::Markdown), - "docs/api.md" - ); - } - - #[test] - fn url_to_filename_nested_path() { - assert_eq!( - url_to_filename("https://example.com/blog/my-post", &OutputFormat::Markdown), - "blog/my-post.md" - ); - } - - #[test] - fn url_to_filename_query_params() { - assert_eq!( - url_to_filename("https://example.com/p?id=123", &OutputFormat::Markdown), - "p_id_123.md" - ); - } - - #[test] - fn url_to_filename_json_format() { - assert_eq!( - url_to_filename("https://example.com/docs/api", &OutputFormat::Json), - "docs/api.json" - ); - } - - #[test] - fn url_to_filename_text_format() { - assert_eq!( - url_to_filename("https://example.com/docs/api", &OutputFormat::Text), - "docs/api.txt" - ); - } - - #[test] - fn url_to_filename_llm_format() { - assert_eq!( - url_to_filename("https://example.com/docs/api", &OutputFormat::Llm), - "docs/api.md" - ); - } - - #[test] - fn url_to_filename_html_format() { - assert_eq!( - url_to_filename("https://example.com/docs/api", &OutputFormat::Html), - "docs/api.html" - ); - } - - #[test] - fn url_to_filename_special_chars() { - // Spaces and special chars get replaced with underscores - assert_eq!( - url_to_filename( - 
"https://example.com/path%20with%20spaces", - &OutputFormat::Markdown - ), - "path_20with_20spaces.md" - ); - } - - #[test] - fn write_to_file_creates_dirs() { - let dir = std::env::temp_dir().join("webclaw_test_output_dir"); - let _ = std::fs::remove_dir_all(&dir); - write_to_file(&dir, "nested/deep/file.md", "hello").unwrap(); - let content = std::fs::read_to_string(dir.join("nested/deep/file.md")).unwrap(); - assert_eq!(content, "hello"); - let _ = std::fs::remove_dir_all(&dir); - } - - #[test] - fn url_to_filename_strips_traversal_segments() { - // `..` / `.` / empty path segments must not survive into the path. - let out = url_to_filename( - "https://example.com/../../etc/passwd", - &OutputFormat::Markdown, - ); - assert!(!out.contains(".."), "traversal leaked: {out}"); - assert_eq!(out, "etc/passwd.md"); - let out2 = url_to_filename("https://example.com/a/./b//c", &OutputFormat::Json); - assert_eq!(out2, "a/b/c.json"); - } - - #[test] - fn safe_relative_filename_rejects_escapes() { - assert!(safe_relative_filename("../escape.md").is_err()); - assert!(safe_relative_filename("a/../../b.md").is_err()); - assert!(safe_relative_filename("/etc/passwd").is_err()); - assert!(safe_relative_filename("").is_err()); - // Normal nested relative names stay allowed. - assert!(safe_relative_filename("nested/deep/file.md").is_ok()); - assert!(safe_relative_filename("./ok.md").is_ok()); - } - - #[test] - fn write_to_file_refuses_traversal_filename() { - let dir = std::env::temp_dir().join("webclaw_test_traversal_dir"); - let _ = std::fs::remove_dir_all(&dir); - // CSV-supplied `url,filename` traversal attempt. 
- let err = write_to_file(&dir, "../../tmp/webclaw_pwned.md", "x").unwrap_err(); - assert!(err.contains("refusing"), "unexpected error: {err}"); - assert!( - !std::path::Path::new("/tmp/webclaw_pwned.md").exists(), - "traversal write escaped the output dir" - ); - let _ = std::fs::remove_dir_all(&dir); - } - - #[test] - fn research_slug_truncation_is_char_safe() { - // Multibyte query: byte-slicing at 50 would panic mid-codepoint. - let query = "日本語".repeat(40); // 120 chars, 3 bytes each - let slug: String = query - .chars() - .map(|c| { - if c.is_alphanumeric() || c == ' ' { - c - } else { - ' ' - } - }) - .collect::() - .split_whitespace() - .collect::>() - .join("-") - .to_lowercase(); - let slug: String = slug.chars().take(50).collect(); - assert!(slug.chars().count() <= 50); - // Round-trips through formatting without panicking. - let _ = format!("research-{slug}.json"); - } -} diff --git a/crates/webclaw-cli/src/output.rs b/crates/webclaw-cli/src/output.rs new file mode 100644 index 0000000..69349ee --- /dev/null +++ b/crates/webclaw-cli/src/output.rs @@ -0,0 +1,376 @@ +//! Output formatting and rendering for every CLI mode. +//! +//! `render_one` is the single source of truth for turning one +//! `ExtractionResult` into a standalone document for a given format. The +//! `print_*`/`format_*` functions own iteration and separator logic and +//! delegate the per-page body to `render_one`. + +use webclaw_core::{ContentDiff, ExtractionResult, Metadata, to_llm_text}; +use webclaw_fetch::{BatchExtractResult, CrawlResult, PageResult, SitemapEntry}; + +use crate::cli::OutputFormat; + +/// Get raw HTML from an extraction result, falling back to markdown if unavailable. 
+pub fn raw_html_or_markdown(result: &ExtractionResult) -> &str { + result + .content + .raw_html + .as_deref() + .unwrap_or(&result.content.markdown) +} + +pub fn format_frontmatter(meta: &Metadata) -> String { + let mut lines = vec!["---".to_string()]; + + if let Some(title) = &meta.title { + lines.push(format!("title: \"{title}\"")); + } + if let Some(author) = &meta.author { + lines.push(format!("author: \"{author}\"")); + } + if let Some(date) = &meta.published_date { + lines.push(format!("date: \"{date}\"")); + } + if let Some(url) = &meta.url { + lines.push(format!("source: \"{url}\"")); + } + if meta.word_count > 0 { + lines.push(format!("word_count: {}", meta.word_count)); + } + + lines.push("---".to_string()); + lines.push(String::new()); // blank line after frontmatter + lines.join("\n") +} + +/// Render a single `ExtractionResult` into a standalone document string for the +/// given format. The Llm format derives its source URL from `metadata.url`. +/// +/// This is the single per-page renderer behind `format_output` and +/// `print_output`. Callers own the iteration and separator framing. 
+pub fn render_one(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String { + match format { + OutputFormat::Markdown => { + let mut out = String::new(); + if show_metadata { + out.push_str(&format_frontmatter(&result.metadata)); + } + out.push_str(&result.content.markdown); + if !result.structured_data.is_empty() { + out.push_str("\n\n## Structured Data\n\n```json\n"); + out.push_str( + &serde_json::to_string_pretty(&result.structured_data).unwrap_or_default(), + ); + out.push_str("\n```"); + } + out + } + OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"), + OutputFormat::Text => result.content.plain_text.clone(), + OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()), + OutputFormat::Html => raw_html_or_markdown(result).to_string(), + } +} + +/// Format an `ExtractionResult` into a string for the given output format. +pub fn format_output( + result: &ExtractionResult, + format: &OutputFormat, + show_metadata: bool, +) -> String { + render_one(result, format, show_metadata) +} + +pub fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) { + println!("{}", render_one(result, format, show_metadata)); +} + +/// Print cloud API response in the requested format. 
+pub fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) { + match format { + OutputFormat::Json => { + println!( + "{}", + serde_json::to_string_pretty(resp).expect("serialization failed") + ); + } + OutputFormat::Markdown => { + // Cloud response has content.markdown + if let Some(md) = resp + .get("content") + .and_then(|c| c.get("markdown")) + .and_then(|m| m.as_str()) + { + println!("{md}"); + } else if let Some(md) = resp.get("markdown").and_then(|m| m.as_str()) { + println!("{md}"); + } else { + println!( + "{}", + serde_json::to_string_pretty(resp).expect("serialization failed") + ); + } + } + OutputFormat::Text => { + if let Some(txt) = resp + .get("content") + .and_then(|c| c.get("plain_text")) + .and_then(|t| t.as_str()) + { + println!("{txt}"); + } else { + // Fallback to markdown or raw JSON + print_cloud_output(resp, &OutputFormat::Markdown); + } + } + OutputFormat::Llm => { + if let Some(llm) = resp + .get("content") + .and_then(|c| c.get("llm_text")) + .and_then(|t| t.as_str()) + { + println!("{llm}"); + } else { + print_cloud_output(resp, &OutputFormat::Markdown); + } + } + OutputFormat::Html => { + if let Some(html) = resp + .get("content") + .and_then(|c| c.get("raw_html")) + .and_then(|h| h.as_str()) + { + println!("{html}"); + } else { + print_cloud_output(resp, &OutputFormat::Markdown); + } + } + } +} + +pub fn print_diff_output(diff: &ContentDiff, format: &OutputFormat) { + match format { + OutputFormat::Json => { + println!( + "{}", + serde_json::to_string_pretty(diff).expect("serialization failed") + ); + } + // For markdown/text/llm, show a human-readable summary + _ => { + println!("Status: {:?}", diff.status); + println!("Word count delta: {:+}", diff.word_count_delta); + + if !diff.metadata_changes.is_empty() { + println!("\nMetadata changes:"); + for change in &diff.metadata_changes { + println!( + " {}: {} -> {}", + change.field, + change.old.as_deref().unwrap_or("(none)"), + 
change.new.as_deref().unwrap_or("(none)"), + ); + } + } + + if !diff.links_added.is_empty() { + println!("\nLinks added:"); + for link in &diff.links_added { + println!(" + {} ({})", link.href, link.text); + } + } + + if !diff.links_removed.is_empty() { + println!("\nLinks removed:"); + for link in &diff.links_removed { + println!(" - {} ({})", link.href, link.text); + } + } + + if let Some(ref text_diff) = diff.text_diff { + println!("\n{text_diff}"); + } + } + } +} + +pub fn print_crawl_output(result: &CrawlResult, format: &OutputFormat, show_metadata: bool) { + match format { + OutputFormat::Json => { + println!( + "{}", + serde_json::to_string_pretty(result).expect("serialization failed") + ); + } + OutputFormat::Markdown => { + for page in &result.pages { + let Some(ref extraction) = page.extraction else { + continue; + }; + println!("---"); + println!("# Page: {}\n", page.url); + if show_metadata { + print!("{}", format_frontmatter(&extraction.metadata)); + } + println!("{}", extraction.content.markdown); + println!(); + } + } + OutputFormat::Text => { + for page in &result.pages { + let Some(ref extraction) = page.extraction else { + continue; + }; + println!("---"); + println!("# Page: {}\n", page.url); + println!("{}", extraction.content.plain_text); + println!(); + } + } + OutputFormat::Llm => { + for page in &result.pages { + let Some(ref extraction) = page.extraction else { + continue; + }; + println!("---"); + println!("{}", to_llm_text(extraction, Some(page.url.as_str()))); + println!(); + } + } + OutputFormat::Html => { + for page in &result.pages { + let Some(ref extraction) = page.extraction else { + continue; + }; + println!("---"); + println!("\n", page.url); + println!("{}", raw_html_or_markdown(extraction)); + println!(); + } + } + } +} + +pub fn print_batch_output( + results: &[BatchExtractResult], + format: &OutputFormat, + show_metadata: bool, +) { + match format { + OutputFormat::Json => { + // Build a JSON array of {url, result?, error?} 
objects + let entries: Vec = results + .iter() + .map(|r| match &r.result { + Ok(extraction) => serde_json::json!({ + "url": r.url, + "result": extraction, + }), + Err(e) => serde_json::json!({ + "url": r.url, + "error": e.to_string(), + }), + }) + .collect(); + println!( + "{}", + serde_json::to_string_pretty(&entries).expect("serialization failed") + ); + } + OutputFormat::Markdown => { + for r in results { + match &r.result { + Ok(extraction) => { + println!("---"); + println!("# {}\n", r.url); + if show_metadata { + print!("{}", format_frontmatter(&extraction.metadata)); + } + println!("{}", extraction.content.markdown); + println!(); + } + Err(e) => { + eprintln!("error: {} -- {}", r.url, e); + } + } + } + } + OutputFormat::Text => { + for r in results { + match &r.result { + Ok(extraction) => { + println!("---"); + println!("# {}\n", r.url); + println!("{}", extraction.content.plain_text); + println!(); + } + Err(e) => { + eprintln!("error: {} -- {}", r.url, e); + } + } + } + } + OutputFormat::Llm => { + for r in results { + match &r.result { + Ok(extraction) => { + println!("---"); + println!("{}", to_llm_text(extraction, Some(r.url.as_str()))); + println!(); + } + Err(e) => { + eprintln!("error: {} -- {}", r.url, e); + } + } + } + } + OutputFormat::Html => { + for r in results { + match &r.result { + Ok(extraction) => { + println!("---"); + println!("\n", r.url); + println!("{}", raw_html_or_markdown(extraction)); + println!(); + } + Err(e) => { + eprintln!("error: {} -- {}", r.url, e); + } + } + } + } + } +} + +pub fn print_map_output(entries: &[SitemapEntry], format: &OutputFormat) { + match format { + OutputFormat::Json => { + println!( + "{}", + serde_json::to_string_pretty(entries).expect("serialization failed") + ); + } + _ => { + for entry in entries { + println!("{}", entry.url); + } + } + } +} + +/// Format a streaming progress line for a completed page. 
+pub fn format_progress(page: &PageResult, index: usize, max_pages: usize) -> String { + let status = if page.error.is_some() { "ERR" } else { "OK " }; + let timing = format!("{}ms", page.elapsed.as_millis()); + let detail = if let Some(ref extraction) = page.extraction { + format!(", {} words", extraction.metadata.word_count) + } else if let Some(ref err) = page.error { + format!(" ({err})") + } else { + String::new() + }; + format!( + "[{index}/{max_pages}] {status} {} ({timing}{detail})", + page.url + ) +} diff --git a/crates/webclaw-cli/src/run.rs b/crates/webclaw-cli/src/run.rs new file mode 100644 index 0000000..2305f64 --- /dev/null +++ b/crates/webclaw-cli/src/run.rs @@ -0,0 +1,1014 @@ +//! Async run handlers for every CLI mode: crawl, map, batch, watch, diff, +//! brand, LLM extraction/summarization, and cloud research. + +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; + +use webclaw_core::{ChangeStatus, ExtractionOptions, ExtractionResult}; +use webclaw_fetch::{CrawlConfig, Crawler, FetchClient, PageResult}; +use webclaw_llm::LlmProvider; + +use crate::cli::{Cli, OutputFormat}; +use crate::fetch::{ + EmptyReason, build_extraction_options, build_fetch_config, detect_empty, + enrich_html_with_stylesheets, fetch_and_extract, fetch_html, normalize_url, url_to_filename, + warn_empty, write_to_file, +}; +use crate::output::{ + format_output, format_progress, print_batch_output, print_crawl_output, print_diff_output, + print_map_output, +}; +use crate::webhook::{fire_webhook, spawn_on_change}; + +pub async fn run_crawl(cli: &Cli) -> Result<(), String> { + let url = cli + .urls + .first() + .ok_or("--crawl requires a URL argument") + .map(|u| normalize_url(u))?; + let url = url.as_str(); + + if cli.file.is_some() || cli.stdin { + return Err("--crawl cannot be used with --file or --stdin".into()); + } + + let include_patterns: Vec = cli + .include_paths + .as_deref() + .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) + 
.unwrap_or_default(); + let exclude_patterns: Vec = cli + .exclude_paths + .as_deref() + .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) + .unwrap_or_default(); + + // Set up streaming progress channel + let (progress_tx, mut progress_rx) = tokio::sync::broadcast::channel::(100); + + // Set up cancel flag for Ctrl+C handling + let cancel_flag = Arc::new(AtomicBool::new(false)); + + // Register Ctrl+C handler when --crawl-state is set + let state_path = cli.crawl_state.clone(); + if state_path.is_some() { + let flag = Arc::clone(&cancel_flag); + tokio::spawn(async move { + tokio::signal::ctrl_c().await.ok(); + flag.store(true, Ordering::Relaxed); + eprintln!("\nCtrl+C received, saving crawl state..."); + }); + } + + let config = CrawlConfig { + fetch: build_fetch_config(cli), + max_depth: cli.depth, + max_pages: cli.max_pages, + concurrency: cli.concurrency, + delay: std::time::Duration::from_millis(cli.delay), + path_prefix: cli.path_prefix.clone(), + use_sitemap: cli.sitemap, + include_patterns, + exclude_patterns, + progress_tx: Some(progress_tx), + cancel_flag: Some(Arc::clone(&cancel_flag)), + allow_subdomains: false, + allow_external_links: false, + }; + + // Load resume state if --crawl-state file exists + let resume_state = state_path + .as_ref() + .and_then(|p| Crawler::load_state(p)) + .inspect(|s| { + eprintln!( + "Resuming crawl: {} pages already visited, {} URLs in frontier", + s.visited.len(), + s.frontier.len(), + ); + }); + + let max_pages = cli.max_pages; + let completed_offset = resume_state.as_ref().map_or(0, |s| s.completed_pages); + + // Spawn background task to print streaming progress to stderr + let progress_handle = tokio::spawn(async move { + let mut count = completed_offset; + while let Ok(page) = progress_rx.recv().await { + count += 1; + eprintln!("{}", format_progress(&page, count, max_pages)); + } + }); + + let crawler = Crawler::new(url, config).map_err(|e| format!("crawler error: {e}"))?; + let result = 
crawler.crawl(url, resume_state).await; + + // Drop the crawler (and its progress_tx clone) so the progress task finishes + drop(crawler); + let _ = progress_handle.await; + + // If cancelled via Ctrl+C and --crawl-state is set, save state for resume + let was_cancelled = cancel_flag.load(Ordering::Relaxed); + if was_cancelled { + if let Some(ref path) = state_path { + Crawler::save_state( + path, + url, + &result.visited, + &result.remaining_frontier, + completed_offset + result.pages.len(), + cli.max_pages, + cli.depth, + )?; + eprintln!( + "Crawl state saved to {} ({} pages completed). Resume with --crawl-state {}", + path.display(), + completed_offset + result.pages.len(), + path.display(), + ); + } + } else if let Some(ref path) = state_path { + // Crawl completed normally — clean up state file + if path.exists() { + let _ = std::fs::remove_file(path); + } + } + + // Log per-page errors and extraction warnings to stderr + for page in &result.pages { + if let Some(ref err) = page.error { + eprintln!("error: {} -- {}", page.url, err); + } else if let Some(ref extraction) = page.extraction { + let reason = detect_empty(extraction); + if !matches!(reason, EmptyReason::None) { + warn_empty(&page.url, &reason); + } + } + } + + if let Some(ref dir) = cli.output_dir { + let mut saved = 0usize; + for page in &result.pages { + if let Some(ref extraction) = page.extraction { + let filename = url_to_filename(&page.url, &cli.format); + let content = format_output(extraction, &cli.format, cli.metadata); + write_to_file(dir, &filename, &content)?; + saved += 1; + } + } + eprintln!("Saved {saved} files to {}", dir.display()); + } else { + print_crawl_output(&result, &cli.format, cli.metadata); + } + + eprintln!( + "Crawled {} pages ({} ok, {} errors) in {:.1}s", + result.total, result.ok, result.errors, result.elapsed_secs, + ); + + // Fire webhook on crawl complete + if let Some(ref webhook_url) = cli.webhook { + let urls: Vec<&str> = result.pages.iter().map(|p| 
p.url.as_str()).collect(); + fire_webhook( + webhook_url, + &serde_json::json!({ + "event": "crawl_complete", + "total": result.total, + "ok": result.ok, + "errors": result.errors, + "elapsed_secs": result.elapsed_secs, + "urls": urls, + }), + ); + // Brief pause so the async webhook has time to fire + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + } + + if result.errors > 0 { + Err(format!( + "{} of {} pages failed", + result.errors, result.total + )) + } else { + Ok(()) + } +} + +pub async fn run_map(cli: &Cli) -> Result<(), String> { + let url = cli + .urls + .first() + .ok_or("--map requires a URL argument") + .map(|u| normalize_url(u))?; + let url = url.as_str(); + + let client = + FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; + + let entries = webclaw_fetch::sitemap::discover(&client, url) + .await + .map_err(|e| format!("sitemap discovery failed: {e}"))?; + + if entries.is_empty() { + eprintln!("no sitemap URLs found for {url}"); + } else { + eprintln!("discovered {} URLs", entries.len()); + } + + print_map_output(&entries, &cli.format); + Ok(()) +} + +pub async fn run_batch(cli: &Cli, entries: &[(String, Option)]) -> Result<(), String> { + let client = Arc::new( + FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?, + ); + + let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect(); + let options = build_extraction_options(cli); + let results = client + .fetch_and_extract_batch_with_options(&urls, cli.concurrency, &options) + .await; + + let ok = results.iter().filter(|r| r.result.is_ok()).count(); + let errors = results.len() - ok; + + // Log errors and extraction warnings to stderr + for r in &results { + if let Err(ref e) = r.result { + eprintln!("error: {} -- {}", r.url, e); + } else if let Ok(ref extraction) = r.result { + let reason = detect_empty(extraction); + if !matches!(reason, EmptyReason::None) { + warn_empty(&r.url, &reason); + } + } + } 
+ + // Build a lookup of custom filenames by URL + let custom_names: std::collections::HashMap<&str, &str> = entries + .iter() + .filter_map(|(url, name)| name.as_deref().map(|n| (url.as_str(), n))) + .collect(); + + if let Some(ref dir) = cli.output_dir { + let mut saved = 0usize; + for r in &results { + if let Ok(ref extraction) = r.result { + let filename = custom_names + .get(r.url.as_str()) + .map(|s| s.to_string()) + .unwrap_or_else(|| url_to_filename(&r.url, &cli.format)); + let content = format_output(extraction, &cli.format, cli.metadata); + write_to_file(dir, &filename, &content)?; + saved += 1; + } + } + eprintln!("Saved {saved} files to {}", dir.display()); + } else { + print_batch_output(&results, &cli.format, cli.metadata); + } + + eprintln!( + "Fetched {} URLs ({} ok, {} errors)", + results.len(), + ok, + errors + ); + + // Fire webhook on batch complete + if let Some(ref webhook_url) = cli.webhook { + let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect(); + fire_webhook( + webhook_url, + &serde_json::json!({ + "event": "batch_complete", + "total": results.len(), + "ok": ok, + "errors": errors, + "urls": urls, + }), + ); + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + } + + if errors > 0 { + Err(format!("{errors} of {} URLs failed", results.len())) + } else { + Ok(()) + } +} + +fn timestamp() -> String { + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + let hours = (now % 86400) / 3600; + let minutes = (now % 3600) / 60; + let seconds = now % 60; + format!("{hours:02}:{minutes:02}:{seconds:02}") +} + +pub async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> { + if urls.is_empty() { + return Err("--watch requires at least one URL".into()); + } + + let client = Arc::new( + FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?, + ); + let options = build_extraction_options(cli); + + // Ctrl+C handler 
+ let cancelled = Arc::new(AtomicBool::new(false)); + let flag = Arc::clone(&cancelled); + tokio::spawn(async move { + tokio::signal::ctrl_c().await.ok(); + flag.store(true, Ordering::Relaxed); + }); + + // Single-URL mode: preserve original behavior exactly + if urls.len() == 1 { + return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await; + } + + // Multi-URL mode: batch fetch, diff each, report aggregate + run_watch_multi(cli, &client, &options, urls, &cancelled).await +} + +/// Original single-URL watch loop -- backward compatible. +async fn run_watch_single( + cli: &Cli, + client: &Arc, + options: &ExtractionOptions, + url: &str, + cancelled: &Arc, +) -> Result<(), String> { + let mut previous = client + .fetch_and_extract_with_options(url, options) + .await + .map_err(|e| format!("initial fetch failed: {e}"))?; + + eprintln!( + "[watch] Initial snapshot: {url} ({} words)", + previous.metadata.word_count + ); + + loop { + // Clamp to >=1s: `--watch-interval 0` would otherwise spin the + // fetch loop with zero delay and hammer the target. + tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await; + + if cancelled.load(Ordering::Relaxed) { + eprintln!("[watch] Stopped"); + break; + } + + let current = match client.fetch_and_extract_with_options(url, options).await { + Ok(result) => result, + Err(e) => { + eprintln!("[watch] Fetch error ({}): {e}", timestamp()); + continue; + } + }; + + let diff = webclaw_core::diff::diff(&previous, ¤t); + + if diff.status == ChangeStatus::Same { + eprintln!("[watch] No changes ({})", timestamp()); + } else { + print_diff_output(&diff, &cli.format); + eprintln!("[watch] Changes detected! 
({})", timestamp()); + + if let Some(ref cmd) = cli.on_change { + let diff_json = serde_json::to_string(&diff).unwrap_or_default(); + spawn_on_change(cmd, diff_json.as_bytes()).await; + } + + if let Some(ref webhook_url) = cli.webhook { + fire_webhook( + webhook_url, + &serde_json::json!({ + "event": "watch_change", + "url": url, + "status": format!("{:?}", diff.status), + "word_count_delta": diff.word_count_delta, + "metadata_changes": diff.metadata_changes.len(), + "links_added": diff.links_added.len(), + "links_removed": diff.links_removed.len(), + }), + ); + } + + previous = current; + } + } + + Ok(()) +} + +/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate. +async fn run_watch_multi( + cli: &Cli, + client: &Arc, + options: &ExtractionOptions, + urls: &[String], + cancelled: &Arc, +) -> Result<(), String> { + let url_refs: Vec<&str> = urls.iter().map(|u| u.as_str()).collect(); + + // Initial pass: fetch all URLs in parallel + let initial_results = client + .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options) + .await; + + let mut snapshots = std::collections::HashMap::new(); + let mut ok_count = 0usize; + let mut err_count = 0usize; + + for r in initial_results { + match r.result { + Ok(extraction) => { + snapshots.insert(r.url, extraction); + ok_count += 1; + } + Err(e) => { + eprintln!("[watch] Initial fetch error: {} -- {e}", r.url); + err_count += 1; + } + } + } + + eprintln!( + "[watch] Watching {} URLs (interval: {}s)", + urls.len(), + cli.watch_interval + ); + eprintln!("[watch] Initial snapshots: {ok_count} ok, {err_count} errors"); + + let mut check_number = 0u64; + + loop { + // Clamp to >=1s: `--watch-interval 0` would otherwise spin the + // fetch loop with zero delay and hammer the target. 
+ tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await; + + if cancelled.load(Ordering::Relaxed) { + eprintln!("[watch] Stopped"); + break; + } + + check_number += 1; + + let current_results = client + .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options) + .await; + + let mut changed: Vec = Vec::new(); + let mut same_count = 0usize; + let mut fetch_errors = 0usize; + + for r in current_results { + match r.result { + Ok(current) => { + if let Some(previous) = snapshots.get(&r.url) { + let diff = webclaw_core::diff::diff(previous, ¤t); + if diff.status == ChangeStatus::Same { + same_count += 1; + } else { + changed.push(serde_json::json!({ + "url": r.url, + "word_count_delta": diff.word_count_delta, + })); + snapshots.insert(r.url, current); + } + } else { + // URL failed initially, first successful fetch -- store as baseline + snapshots.insert(r.url, current); + same_count += 1; + } + } + Err(e) => { + eprintln!("[watch] Fetch error: {} -- {e}", r.url); + fetch_errors += 1; + } + } + } + + let ts = timestamp(); + let err_suffix = if fetch_errors > 0 { + format!(", {fetch_errors} errors") + } else { + String::new() + }; + + if changed.is_empty() { + eprintln!( + "[watch] Check {check_number} ({ts}): 0 changed, {same_count} same{err_suffix}" + ); + } else { + eprintln!( + "[watch] Check {check_number} ({ts}): {} changed, {same_count} same{err_suffix}", + changed.len(), + ); + for entry in &changed { + let url = entry["url"].as_str().unwrap_or("?"); + let delta = entry["word_count_delta"].as_i64().unwrap_or(0); + eprintln!(" -> {url} (word delta: {delta:+})"); + } + + // Fire --on-change once with all changes + if let Some(ref cmd) = cli.on_change { + let payload = serde_json::json!({ + "event": "watch_changes", + "check_number": check_number, + "total_urls": urls.len(), + "changed": changed.len(), + "same": same_count, + "changes": changed, + }); + let payload_json = 
serde_json::to_string(&payload).unwrap_or_default(); + spawn_on_change(cmd, payload_json.as_bytes()).await; + } + + // Fire webhook once with aggregate payload + if let Some(ref webhook_url) = cli.webhook { + fire_webhook( + webhook_url, + &serde_json::json!({ + "event": "watch_changes", + "check_number": check_number, + "total_urls": urls.len(), + "changed": changed.len(), + "same": same_count, + "changes": changed, + }), + ); + } + } + } + + Ok(()) +} + +pub async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> { + // Load previous snapshot + let snapshot_json = std::fs::read_to_string(snapshot_path) + .map_err(|e| format!("failed to read snapshot {snapshot_path}: {e}"))?; + let old: ExtractionResult = serde_json::from_str(&snapshot_json) + .map_err(|e| format!("failed to parse snapshot JSON: {e}"))?; + + // Extract current version (handles PDF detection for URLs) + let new_result = fetch_and_extract(cli).await?.into_extraction()?; + + let diff = webclaw_core::diff::diff(&old, &new_result); + print_diff_output(&diff, &cli.format); + + Ok(()) +} + +pub async fn run_brand(cli: &Cli) -> Result<(), String> { + let result = fetch_html(cli).await?; + let enriched = enrich_html_with_stylesheets(&result.html, &result.url).await; + let brand = webclaw_core::brand::extract_brand( + &enriched, + Some(result.url.as_str()).filter(|s| !s.is_empty()), + ); + println!( + "{}", + serde_json::to_string_pretty(&brand).expect("serialization failed") + ); + Ok(()) +} + +/// Build an LLM provider based on CLI flags, or fall back to the default chain. 
+async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
+    // NOTE(review): the boxed trait-object type was stripped from this copy of
+    // the patch (it read `Result, String>`). `Box<dyn LlmProvider>` is
+    // reconstructed from the `Box::new(..)` returns below -- confirm the
+    // trait's exact name/path against the real source.
+    if let Some(ref name) = cli.llm_provider {
+        match name.as_str() {
+            "ollama" => {
+                let provider = webclaw_llm::providers::ollama::OllamaProvider::new(
+                    cli.llm_base_url.clone(),
+                    cli.llm_model.clone(),
+                );
+                // Ollama is probed up front so an unreachable server fails here
+                // rather than on the first request.
+                if !provider.is_available().await {
+                    return Err("ollama is not running or unreachable".into());
+                }
+                Ok(Box::new(provider))
+            }
+            "openai" => {
+                let provider = webclaw_llm::providers::openai::OpenAiProvider::new(
+                    None,
+                    cli.llm_base_url.clone(),
+                    cli.llm_model.clone(),
+                )
+                .ok_or("OPENAI_API_KEY not set")?;
+                Ok(Box::new(provider))
+            }
+            "anthropic" => {
+                let provider = webclaw_llm::providers::anthropic::AnthropicProvider::with_base_url(
+                    None,
+                    cli.llm_base_url.clone(),
+                    cli.llm_model.clone(),
+                )
+                .ok_or("ANTHROPIC_API_KEY not set")?;
+                Ok(Box::new(provider))
+            }
+            other => Err(format!(
+                "unknown LLM provider: {other} (use ollama, openai, or anthropic)"
+            )),
+        }
+    } else {
+        // No explicit provider: use the default chain, and refuse to continue
+        // if it ended up with nothing in it.
+        let chain = webclaw_llm::ProviderChain::default().await;
+        if chain.is_empty() {
+            return Err(
+                "no LLM providers available -- start Ollama or set OPENAI_API_KEY / ANTHROPIC_API_KEY"
+                    .into(),
+            );
+        }
+        Ok(Box::new(chain))
+    }
+}
+
+/// Resolve an `--extract-json` argument into a parsed JSON value.
+///
+/// `@path` reads the schema text from `path`; any other input is parsed as
+/// inline JSON. Shared by [`run_llm`] and [`run_batch_llm`] so both accept the
+/// same syntax and report the same errors.
+///
+/// # Errors
+/// Returns a message if the file cannot be read or the text is not valid JSON.
+fn load_json_schema(schema_input: &str) -> Result<serde_json::Value, String> {
+    let schema_str = match schema_input.strip_prefix('@') {
+        Some(path) => std::fs::read_to_string(path)
+            .map_err(|e| format!("failed to read schema file {path}: {e}"))?,
+        None => schema_input.to_owned(),
+    };
+    serde_json::from_str(&schema_str).map_err(|e| format!("invalid JSON schema: {e}"))
+}
+
+/// Fetch and extract the CLI's source, then run one LLM operation on its plain text.
+///
+/// Flag precedence is `extract_json`, then `extract_prompt`, then `summarize`;
+/// the first one set wins. JSON results are pretty-printed to stdout, a
+/// summary is printed as-is. If none of the three is set nothing is printed.
+///
+/// # Errors
+/// Propagates fetch/extraction, provider construction, schema loading and LLM
+/// call failures as strings.
+pub async fn run_llm(cli: &Cli) -> Result<(), String> {
+    // Extract content from source first (handles PDF detection for URLs)
+    let result = fetch_and_extract(cli).await?.into_extraction()?;
+
+    let provider = build_llm_provider(cli).await?;
+    let model = cli.llm_model.as_deref();
+    let text = &result.content.plain_text;
+
+    if let Some(ref schema_input) = cli.extract_json {
+        // Support @file syntax for loading schema from file
+        let schema = load_json_schema(schema_input)?;
+
+        let extracted = webclaw_llm::extract::extract_json(text, &schema, provider.as_ref(), model)
+            .await
+            .map_err(|e| format!("LLM extraction failed: {e}"))?;
+
+        println!(
+            "{}",
+            serde_json::to_string_pretty(&extracted).expect("serialization failed")
+        );
+    } else if let Some(ref prompt) = cli.extract_prompt {
+        let extracted =
+            webclaw_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model)
+                .await
+                .map_err(|e| format!("LLM extraction failed: {e}"))?;
+
+        println!(
+            "{}",
+            serde_json::to_string_pretty(&extracted).expect("serialization failed")
+        );
+    } else if let Some(sentences) = cli.summarize {
+        let summary =
+            webclaw_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model)
+                .await
+                .map_err(|e| format!("LLM summarization failed: {e}"))?;
+
+        println!("{summary}");
+    }
+
+    Ok(())
+}
+
+/// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results.
+/// URLs are processed sequentially to respect LLM provider rate limits.
+///
+/// `entries` pairs each URL with an optional custom output filename. With
+/// `--output-dir` each result is written to a file (custom name, else one
+/// derived from the URL); otherwise it is printed to stdout under a
+/// `--- <url>` header. Per-URL failures are counted and logged to stderr and
+/// do not stop the batch.
+///
+/// # Errors
+/// Fails immediately on client/provider/schema setup errors or a file write
+/// error, and at the end if any URL failed.
+///
+/// # Panics
+/// Panics if none of the LLM flags is set; callers are expected to check
+/// `has_llm_flags` first.
+pub async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
+    let client =
+        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
+    let options = build_extraction_options(cli);
+    let provider = build_llm_provider(cli).await?;
+    let model = cli.llm_model.as_deref();
+
+    // Pre-parse schema once if --extract-json is used
+    let schema = match cli.extract_json {
+        Some(ref schema_input) => Some(load_json_schema(schema_input)?),
+        None => None,
+    };
+
+    // Build custom filename lookup from entries
+    let custom_names: std::collections::HashMap<&str, &str> = entries
+        .iter()
+        .filter_map(|(url, name)| name.as_deref().map(|n| (url.as_str(), n)))
+        .collect();
+
+    let total = entries.len();
+    let mut ok = 0usize;
+    let mut errors = 0usize;
+
+    for (i, (url, _)) in entries.iter().enumerate() {
+        let idx = i + 1;
+        // No newline: the outcome ("-> ...") is appended to this progress line.
+        eprint!("[{idx}/{total}] {url} ");
+
+        // Fetch and extract page content
+        let extraction = match client.fetch_and_extract_with_options(url, &options).await {
+            Ok(r) => r,
+            Err(e) => {
+                errors += 1;
+                eprintln!("-> error: fetch failed: {e}");
+                continue;
+            }
+        };
+
+        let text = &extraction.content.plain_text;
+
+        // Run the appropriate LLM operation
+        let llm_result = if let Some(ref schema) = schema {
+            webclaw_llm::extract::extract_json(text, schema, provider.as_ref(), model)
+                .await
+                .map(LlmOutput::Json)
+        } else if let Some(ref prompt) = cli.extract_prompt {
+            webclaw_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model)
+                .await
+                .map(LlmOutput::Json)
+        } else if let Some(sentences) = cli.summarize {
+            webclaw_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model)
+                .await
+                .map(LlmOutput::Text)
+        } else {
+            unreachable!("run_batch_llm called without LLM flags")
+        };
+
+        let output = match llm_result {
+            Ok(output) => output,
+            Err(e) => {
+                errors += 1;
+                eprintln!("-> error: LLM extraction failed: {e}");
+                continue;
+            }
+        };
+        ok += 1;
+
+        // Render the result and a short size hint (top-level fields/items, or
+        // word count) for the progress line. The previous per-URL JSON
+        // accumulator was write-only and has been dropped.
+        let (output_str, detail) = match output {
+            LlmOutput::Json(v) => {
+                let detail = match &v {
+                    serde_json::Value::Object(m) => format!("{} fields", m.len()),
+                    serde_json::Value::Array(a) => format!("{} items", a.len()),
+                    _ => "done".to_string(),
+                };
+                let rendered = serde_json::to_string_pretty(&v).expect("serialization failed");
+                (rendered, detail)
+            }
+            LlmOutput::Text(s) => {
+                let words = s.split_whitespace().count();
+                (s, format!("{words} words"))
+            }
+        };
+        eprintln!("-> extracted {detail}");
+
+        if let Some(ref dir) = cli.output_dir {
+            let filename = custom_names
+                .get(url.as_str())
+                .map(|s| s.to_string())
+                .unwrap_or_else(|| url_to_filename(url, &OutputFormat::Json));
+            write_to_file(dir, &filename, &output_str)?;
+        } else {
+            println!("--- {url}");
+            println!("{output_str}");
+            println!();
+        }
+    }
+
+    eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");
+
+    if let Some(ref webhook_url) = cli.webhook {
+        fire_webhook(
+            webhook_url,
+            &serde_json::json!({
+                "event": "batch_llm_complete",
+                "total": total,
+                "ok": ok,
+                "errors": errors,
+            }),
+        );
+        // Brief pause so the detached webhook task has time to send.
+        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+    }
+
+    if errors > 0 {
+        Err(format!("{errors} of {total} URLs failed"))
+    } else {
+        Ok(())
+    }
+}
+
+/// Intermediate type to hold LLM output before formatting.
+enum LlmOutput {
+    /// Structured result from schema- or prompt-driven extraction.
+    Json(serde_json::Value),
+    /// Free-text result from summarization.
+    Text(String),
+}
+
+/// Returns true if any LLM flag is set.
+pub fn has_llm_flags(cli: &Cli) -> bool {
+    cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some()
+}
+
+/// Start a hosted research job for `query`, wait for it, and print the report.
+///
+/// The job is created with a POST to `https://api.webclaw.io/v1/research`
+/// (bearer auth with `cli.api_key`, plus `"deep": true` when `cli.deep` is
+/// set) and then polled every 3 seconds, up to 200 times. On `"completed"`
+/// the full status JSON is written to `research-<slug>.json` (relative path)
+/// and the report, if non-empty, goes to stdout.
+///
+/// # Errors
+/// Returns a message when the API key is missing, a request or JSON parse
+/// fails, no job id is returned, the result file cannot be written, the job
+/// reports `"failed"`, or polling runs out.
+pub async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
+    let api_key = cli
+        .api_key
+        .as_deref()
+        .ok_or("--research requires WEBCLAW_API_KEY (set via env or --api-key)")?;
+
+    let client = reqwest::Client::builder()
+        .timeout(std::time::Duration::from_secs(600))
+        .build()
+        .map_err(|e| format!("http client error: {e}"))?;
+
+    let mut body = serde_json::json!({ "query": query });
+    if cli.deep {
+        body["deep"] = serde_json::json!(true);
+    }
+
+    eprintln!("Starting research: {query}");
+    if cli.deep {
+        eprintln!("Deep mode enabled (longer, more thorough)");
+    }
+
+    // Start job
+    let resp = client
+        .post("https://api.webclaw.io/v1/research")
+        .header("Authorization", format!("Bearer {api_key}"))
+        .json(&body)
+        .send()
+        .await
+        .map_err(|e| format!("API error: {e}"))?
+        .json::<serde_json::Value>()
+        .await
+        .map_err(|e| format!("parse error: {e}"))?;
+
+    let job_id = resp
+        .get("id")
+        .and_then(|v| v.as_str())
+        .ok_or("API did not return a job ID")?
+        .to_string();
+
+    eprintln!("Job started: {job_id}");
+
+    // Poll: 200 attempts x 3 s is the "~10 minutes" quoted in the timeout error.
+    for poll in 0..200 {
+        tokio::time::sleep(std::time::Duration::from_secs(3)).await;
+
+        let status_resp = client
+            .get(format!("https://api.webclaw.io/v1/research/{job_id}"))
+            .header("Authorization", format!("Bearer {api_key}"))
+            .send()
+            .await
+            .map_err(|e| format!("poll error: {e}"))?
+            .json::<serde_json::Value>()
+            .await
+            .map_err(|e| format!("parse error: {e}"))?;
+
+        let status = status_resp
+            .get("status")
+            .and_then(|v| v.as_str())
+            .unwrap_or("unknown");
+
+        match status {
+            "completed" => {
+                let report = status_resp
+                    .get("report")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or("");
+
+                // Save full result to JSON file
+                let slug = research_slug(query);
+                let filename = format!("research-{slug}.json");
+
+                let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default();
+                std::fs::write(&filename, &json)
+                    .map_err(|e| format!("failed to write {filename}: {e}"))?;
+
+                let elapsed = status_resp
+                    .get("elapsed_ms")
+                    .and_then(|v| v.as_i64())
+                    .unwrap_or(0);
+                let sources = status_resp
+                    .get("sources_count")
+                    .and_then(|v| v.as_i64())
+                    .unwrap_or(0);
+                let findings = status_resp
+                    .get("findings_count")
+                    .and_then(|v| v.as_i64())
+                    .unwrap_or(0);
+
+                eprintln!(
+                    "Research complete: {sources} sources, {findings} findings, {:.1}s",
+                    elapsed as f64 / 1000.0
+                );
+                eprintln!("Saved to: {filename}");
+
+                // Print report to stdout
+                if !report.is_empty() {
+                    println!("{report}");
+                }
+
+                return Ok(());
+            }
+            "failed" => {
+                let error = status_resp
+                    .get("error")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or("unknown error");
+                return Err(format!("Research failed: {error}"));
+            }
+            _ => {
+                // Any other status means still running; report every 10th poll.
+                if poll % 10 == 9 {
+                    eprintln!("Still researching... ({:.0}s)", (poll + 1) as f64 * 3.0);
+                }
+            }
+        }
+    }
+
+    Err(format!(
+        "Research timed out after ~10 minutes. Check status: GET /v1/research/{job_id}"
+    ))
+}
+
+/// Turn a free-text query into the slug used in `research-<slug>.json`.
+///
+/// Every character that is not alphanumeric or a space becomes a space, runs
+/// of whitespace collapse into single `-` separators, the result is
+/// lowercased, and at most the first 50 characters are kept.
+fn research_slug(query: &str) -> String {
+    query
+        .chars()
+        .map(|c| {
+            if c.is_alphanumeric() || c == ' ' {
+                c
+            } else {
+                ' '
+            }
+        })
+        .collect::<String>()
+        .split_whitespace()
+        .collect::<Vec<_>>()
+        .join("-")
+        .to_lowercase()
+        // char-safe truncation: byte slicing panics if char 50
+        // lands mid-codepoint (multibyte queries).
+        .chars()
+        .take(50)
+        .collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::research_slug;
+
+    #[test]
+    fn research_slug_truncation_is_char_safe() {
+        // Multibyte query: byte-slicing at 50 would panic mid-codepoint.
+        let query = "日本語".repeat(40); // 120 chars, 3 bytes each
+        let slug = research_slug(&query);
+        assert!(slug.chars().count() <= 50);
+        // Round-trips through formatting without panicking.
+        let _ = format!("research-{slug}.json");
+    }
+
+    #[test]
+    fn research_slug_normalizes_punctuation_and_case() {
+        assert_eq!(
+            research_slug("Rust: async/await & Tokio!"),
+            "rust-async-await-tokio"
+        );
+    }
+}
diff --git a/crates/webclaw-cli/src/webhook.rs b/crates/webclaw-cli/src/webhook.rs
new file mode 100644
index 0000000..e3f69ac
--- /dev/null
+++ b/crates/webclaw-cli/src/webhook.rs
@@ -0,0 +1,121 @@
+//! Webhook delivery and `--on-change` command execution.
+
+/// Spawn the `--on-change` command with `payload` on stdin.
+///
+/// Previously this passed the entire user-provided string to `sh -c`, which
+/// made `--on-change 'notify "$URL"; rm -rf /'` a plausible disaster the
+/// moment an untrusted config file or MCP-driven agent fed us a command.
+/// The MCP surface specifically is prompt-injection-exposed: an LLM that
+/// controls CLI args can escalate into arbitrary shell on the host.
+///
+/// We now parse the command with `shlex` (POSIX-ish tokenization with proper
+/// quoting) and exec the program directly without an intermediate shell, so
+/// metacharacters like `;`, `&&`, `|`, `$()`, and env expansion can't fire.
+/// Users who genuinely need a pipeline can set the whole chain behind a
+/// script they've written, or opt in per-call via `WEBCLAW_ALLOW_SHELL=1`
+/// (documented escape hatch, noisy by design).
+pub async fn spawn_on_change(cmd: &str, stdin_payload: &[u8]) {
+    use tokio::io::AsyncWriteExt;
+    use tokio::process::Command;
+
+    eprintln!("[watch] Running: {cmd}");
+
+    // The shell path is opt-in: the variable must be exactly "1" or a
+    // case-insensitive "true". Unset or any other value means direct exec.
+    let shell_opt_in = matches!(
+        std::env::var("WEBCLAW_ALLOW_SHELL"),
+        Ok(v) if v == "1" || v.eq_ignore_ascii_case("true")
+    );
+
+    let mut runner = if shell_opt_in {
+        eprintln!("[watch] WEBCLAW_ALLOW_SHELL=1 — executing via sh -c (unsafe)");
+        let mut sh = Command::new("sh");
+        sh.args(["-c", cmd]);
+        sh
+    } else {
+        // Split into argv with shlex and run the program directly, so the
+        // string is never handed to a shell.
+        let tokens = match shlex::split(cmd) {
+            Some(tokens) => tokens,
+            None => {
+                eprintln!("[watch] Failed to parse --on-change command (unbalanced quotes?)");
+                return;
+            }
+        };
+        let mut words = tokens.into_iter();
+        let Some(program) = words.next() else {
+            eprintln!("[watch] --on-change command is empty");
+            return;
+        };
+        let mut direct = Command::new(program);
+        direct.args(words);
+        direct
+    };
+    runner.stdin(std::process::Stdio::piped());
+
+    let mut child = match runner.spawn() {
+        Ok(child) => child,
+        Err(e) => {
+            eprintln!("[watch] Failed to run command: {e}");
+            return;
+        }
+    };
+
+    // Feed the payload to the child's stdin. The write result is discarded,
+    // so a failed write is silent, and the child is not waited on.
+    if let Some(mut pipe) = child.stdin.take() {
+        let _ = pipe.write_all(stdin_payload).await;
+    }
+}
+
+/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr.
+/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly.
+///
+/// The request body is built synchronously from `payload`; URL validation and
+/// the POST itself (10 second timeout) run on a `tokio::spawn`ed task whose
+/// handle is dropped, so the caller never sees the outcome. Only the first 60
+/// characters of the URL are echoed to stderr.
+pub fn fire_webhook(url: &str, payload: &serde_json::Value) {
+    let url = url.to_string();
+    let is_discord = url.contains("discord.com/api/webhooks");
+    let is_slack = url.contains("hooks.slack.com");
+
+    let body = if is_discord {
+        // Discord: wrap the pretty-printed payload in a single embed.
+        let event = payload
+            .get("event")
+            .and_then(|v| v.as_str())
+            .unwrap_or("notification");
+        let details = serde_json::to_string_pretty(payload).unwrap_or_default();
+        serde_json::json!({
+            "embeds": [{
+                "title": format!("webclaw: {event}"),
+                "description": format!("```json\n{details}\n```"),
+                "color": 5814783
+            }]
+        })
+        .to_string()
+    } else if is_slack {
+        // Slack: a single `text` field with the payload in a code fence.
+        let event = payload
+            .get("event")
+            .and_then(|v| v.as_str())
+            .unwrap_or("notification");
+        let details = serde_json::to_string_pretty(payload).unwrap_or_default();
+        serde_json::json!({
+            "text": format!("*webclaw: {event}*\n```{details}```")
+        })
+        .to_string()
+    } else {
+        // Anything else receives the payload unchanged.
+        serde_json::to_string(payload).unwrap_or_default()
+    };
+    tokio::spawn(async move {
+        // SSRF guard: a webhook URL is user-supplied and otherwise bypasses
+        // the fetch-layer protections, so resolve + reject private/internal
+        // destinations before sending the payload.
+        if let Err(e) = webclaw_fetch::url_security::validate_public_http_url(&url).await {
+            eprintln!("[webhook] refusing unsafe URL: {e}");
+            return;
+        }
+        match reqwest::Client::builder()
+            .timeout(std::time::Duration::from_secs(10))
+            .build()
+        {
+            Ok(c) => match c
+                .post(&url)
+                .header("Content-Type", "application/json")
+                .body(body)
+                .send()
+                .await
+            {
+                Ok(resp) => {
+                    // Char-safe truncation: byte-slicing the URL at 60 panics
+                    // when byte 60 falls inside a multibyte character. For
+                    // ASCII URLs the output is unchanged.
+                    let shown: String = url.chars().take(60).collect();
+                    eprintln!("[webhook] POST {} -> {}", shown, resp.status());
+                }
+                Err(e) => eprintln!("[webhook] POST failed: {e}"),
+            },
+            Err(e) => eprintln!("[webhook] client error: {e}"),
+        }
+    });
+}
diff --git a/crates/webclaw-core/Cargo.toml b/crates/webclaw-core/Cargo.toml index 5c2743a..dbd505c 100644 --- a/crates/webclaw-core/Cargo.toml +++ b/crates/webclaw-core/Cargo.toml @@ -3,12 +3,16 @@ name = "webclaw-core" description = "Pure HTML content extraction engine for LLMs" version.workspace = true edition.workspace = true +rust-version.workspace = true license.workspace = true # Reddit regression fixtures are real old.reddit.com pages read at test time; # they're large and only needed to run the test suite from the repo, so keep # them out of the published crate. 
exclude = ["testdata/reddit/*.html"] +[lints] +workspace = true + [features] default = ["quickjs"] quickjs = ["rquickjs"] diff --git a/crates/webclaw-core/src/domain.rs b/crates/webclaw-core/src/domain.rs index 1b5d6eb..eaa5d19 100644 --- a/crates/webclaw-core/src/domain.rs +++ b/crates/webclaw-core/src/domain.rs @@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] +#[non_exhaustive] pub enum DomainType { Article, Documentation, diff --git a/crates/webclaw-core/src/error.rs b/crates/webclaw-core/src/error.rs index d6bb9dc..4f28e55 100644 --- a/crates/webclaw-core/src/error.rs +++ b/crates/webclaw-core/src/error.rs @@ -3,6 +3,7 @@ use thiserror::Error; #[derive(Debug, Error)] +#[non_exhaustive] pub enum ExtractError { #[error("failed to parse HTML")] ParseError, diff --git a/crates/webclaw-core/src/js_eval.rs b/crates/webclaw-core/src/js_eval.rs index e1fb2de..2f78246 100644 --- a/crates/webclaw-core/src/js_eval.rs +++ b/crates/webclaw-core/src/js_eval.rs @@ -16,6 +16,29 @@ static SCRIPT_SELECTOR: Lazy = Lazy::new(|| Selector::parse("script"). static HTML_TAG_RE: Lazy = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap()); const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250); +/// Markers that, if absent from the HTML, prove the QuickJS scan cannot find +/// any data blob. The scan only ever surfaces `globalThis.__*` object/array +/// properties, and the seeded `__next_f` only emits when non-empty. Every +/// realistic way an inline script populates such a global goes through one of +/// these substrings (`window.`/`self.__next` assignments, or the +/// `__NEXT_DATA__`/`__NUXT__`/`application/json` payload conventions). If none +/// are present, running the VM is guaranteed to return zero blobs, so skipping +/// it is output-neutral. Conservative by design: any of these may appear in +/// non-script HTML too, which only makes us skip *less* often, never more. 
+const JS_CANDIDATE_MARKERS: [&str; 5] = [
+    "window.",
+    "__NEXT_DATA__",
+    "__NUXT__",
+    "application/json",
+    "self.__next",
+];
+
+/// Cheap pre-filter for the QuickJS scan: reports whether `html` contains at
+/// least one of the [`JS_CANDIDATE_MARKERS`] substrings.
+///
+/// The check is a plain, case-sensitive substring search over the whole
+/// document, not just its `<script>` elements.
+///
+/// NOTE(review): a page that populates a `__*` global without any of these
+/// substrings (for example a top-level `var __STATE__ = {...}`) returns
+/// `false` here -- confirm the scan cannot surface such globals before
+/// treating a `false` result as a guaranteed no-op.
+pub fn has_js_candidate_data(html: &str) -> bool {
+    for marker in JS_CANDIDATE_MARKERS {
+        if html.contains(marker) {
+            return true;
+        }
+    }
+    false
+}
+
/// A blob of data extracted from JS execution. pub struct JsDataBlob { pub name: String, @@ -24,9 +47,17 @@ pub struct JsDataBlob { } /// Execute inline `