feat(cli): add webclaw bench <url> subcommand (closes #26)

Per-URL extraction micro-benchmark. Fetches a URL once, runs the same pipeline as --format llm, and prints a small ASCII table comparing raw HTML vs. llm output on tokens, bytes, and extraction time.

    webclaw bench https://stripe.com               # ASCII table
    webclaw bench https://stripe.com --json        # one-line JSON
    webclaw bench https://stripe.com --facts FILE  # adds fidelity row

The --facts file uses the same schema as benchmarks/facts.json (curated visible-fact list per URL). URLs not in the file produce no fidelity row, so an uncurated site doesn't show 0/0.

v1 uses an approximate tokenizer (chars/4 Latin, chars/2 when CJK dominates). Off by ~10% vs cl100k_base, but the signal — 'is the LLM output 90% smaller than the raw HTML' — is order-of-magnitude, not precise accounting. Output is labeled '≈ tokens' so nobody mistakes it for a real BPE count. Swapping in tiktoken-rs later is a one-function change; it is left out of v1 to avoid the 2 MB BPE-data binary bloat for a feature most users will run a handful of times.

Implemented as a real clap subcommand (clap::Subcommand) rather than yet another flag, with the existing flag-based flow falling through when no subcommand is given. Existing 'webclaw <url> --format ...' invocations work exactly as before. This lays the groundwork for future subcommands without disrupting the legacy flat-flag UX.

12 new unit tests cover the tokenizer, formatters, host extraction, and fact-matching. Verified end-to-end on example.com and tavily.com (5/5 facts preserved at 93% token reduction).
2026-04-25 00:06:21 +02:00 · 2026-04-22 12:25:29 +02:00 · 2026-04-22 12:25:29 +02:00 · d91ad9c1f4
commit d91ad9c1f4
parent 2ba682adf3
2 changed files with 471 additions and 1 deletion
--- a/crates/webclaw-cli/src/bench.rs
+++ b/crates/webclaw-cli/src/bench.rs
@ -0,0 +1,422 @@
+//! `webclaw bench <url>` — per-URL extraction micro-benchmark.
+//!
+//! Fetches a page, extracts it via the same pipeline that powers
+//! `--format llm`, and reports how many tokens the LLM pipeline
+//! removed vs. the raw HTML. Optional `--facts` reuses the
+//! benchmark harness's curated fact lists to score fidelity.
+//!
+//! v1 uses an *approximate* tokenizer (chars/4 for Latin text,
+//! chars/2 for CJK-heavy text). Output is clearly labeled
+//! "≈ tokens" so nobody mistakes it for a real tiktoken run.
+//! Swapping to tiktoken-rs later is a one-function change.
+
+use std::path::{Path, PathBuf};
+use std::time::Instant;
+
+use webclaw_core::{extract, to_llm_text};
+use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig};
+
+/// Inputs collected from the `bench` clap subcommand.
+///
+/// The CLI entry point fills this in from the parsed arguments and passes
+/// it to [`run`] by reference.
+#[derive(Debug, Clone)]
+pub struct BenchArgs {
+    /// URL to fetch and benchmark.
+    pub url: String,
+    /// When `true`, emit one JSON line instead of the box table.
+    pub json: bool,
+    /// Optional path to a facts file; when set, a fidelity count is
+    /// computed for `url` if the file has an entry for it.
+    pub facts: Option<PathBuf>,
+}
+
+/// Everything one bench run measured, ready to be rendered as a box or
+/// as a JSON line.
+struct BenchResult {
+    /// The URL exactly as the user passed it on the command line.
+    url: String,
+    /// Size of the fetched HTML in bytes.
+    raw_bytes: usize,
+    /// Approximate token count of the fetched HTML.
+    raw_tokens: usize,
+    /// Size of the LLM-formatted text in bytes.
+    llm_bytes: usize,
+    /// Approximate token count of the LLM-formatted text.
+    llm_tokens: usize,
+    /// Percentage of approximate tokens removed relative to the raw HTML
+    /// (`0.0` when the raw HTML has zero tokens).
+    reduction_pct: f64,
+    /// Wall-clock seconds for the measured span.
+    elapsed_secs: f64,
+    /// `Some((found, total))` when `--facts` was given and the facts file
+    /// has an entry for this URL; `None` in every other case.
+    facts: Option<(usize, usize)>,
+}
+
+/// Run one benchmark: fetch `args.url`, extract it, and print the report.
+///
+/// The timed span starts before the fetch and ends after the LLM text is
+/// rendered, so the reported time includes network time as well as
+/// extraction. Token counting and the optional facts check happen outside
+/// the timed span.
+///
+/// # Errors
+///
+/// Returns a human-readable message when the client cannot be built, the
+/// fetch or extraction fails, or the `--facts` file cannot be used.
+pub async fn run(args: &BenchArgs) -> Result<(), String> {
+    // Bench builds its own client from the default config plus a fixed
+    // browser profile, so none of the global CLI flags (proxies, custom
+    // headers, ...) influence the measurement.
+    // NOTE(review): the original comment suggests WEBCLAW_PROXY as the way
+    // to bench behind a proxy; whether FetchConfig reads it is not visible
+    // from this file — confirm.
+    let client = FetchClient::new(FetchConfig {
+        browser: BrowserProfile::Chrome,
+        ..FetchConfig::default()
+    })
+    .map_err(|e| format!("build client: {e}"))?;
+
+    let timer = Instant::now();
+    let page = client
+        .fetch(&args.url)
+        .await
+        .map_err(|e| format!("fetch: {e}"))?;
+    let extracted = extract(&page.html, Some(&page.url)).map_err(|e| format!("extract: {e}"))?;
+    let llm_text = to_llm_text(&extracted, Some(&page.url));
+    let elapsed_secs = timer.elapsed().as_secs_f64();
+
+    let raw_tokens = approx_tokens(&page.html);
+    let llm_tokens = approx_tokens(&llm_text);
+    // An empty page has nothing to reduce; report 0% rather than divide by zero.
+    let reduction_pct = match raw_tokens {
+        0 => 0.0,
+        raw => 100.0 * (1.0 - llm_tokens as f64 / raw as f64),
+    };
+
+    // No `--facts` flag → None; flag given but URL not curated → also None.
+    let facts = args
+        .facts
+        .as_deref()
+        .map(|path| check_facts(path, &args.url, &llm_text))
+        .transpose()?
+        .flatten();
+
+    let result = BenchResult {
+        url: args.url.clone(),
+        raw_tokens,
+        raw_bytes: page.html.len(),
+        llm_tokens,
+        llm_bytes: llm_text.len(),
+        reduction_pct,
+        elapsed_secs,
+        facts,
+    };
+
+    match args.json {
+        true => print_json(&result),
+        false => print_box(&result),
+    }
+    Ok(())
+}
+
+// ---------------------------------------------------------------------------
+// Approximate tokenizer
+// ---------------------------------------------------------------------------
+
+/// Estimate the token count of `s` without running a real tokenizer.
+///
+/// Uses the common "four characters per token" rule of thumb, switching
+/// to two characters per token when more than 30% of the characters are
+/// CJK (as classified by `is_cjk`). The result is rounded up, and an
+/// empty string yields `0`.
+///
+/// This is a heuristic, not a BPE count: it is meant to show the rough
+/// size ratio between raw HTML and the extracted text, nothing finer.
+fn approx_tokens(s: &str) -> usize {
+    // One pass over the string: tally every char and the CJK subset together.
+    let (chars, cjk_chars) = s.chars().fold((0usize, 0usize), |(all, cjk), c| {
+        (all + 1, cjk + usize::from(is_cjk(c)))
+    });
+    if chars == 0 {
+        return 0;
+    }
+
+    let cjk_share = cjk_chars as f64 / chars as f64;
+    let chars_per_token = if cjk_share > 0.30 { 2 } else { 4 };
+    chars.div_ceil(chars_per_token)
+}
+
+/// Whether `c` lies in one of the CJK blocks the token heuristic counts:
+/// Hiragana, Katakana, CJK Extension A, CJK Unified Ideographs, or
+/// Hangul Syllables. Other CJK-related blocks are not covered.
+fn is_cjk(c: char) -> bool {
+    matches!(
+        c,
+        '\u{3040}'..='\u{309F}' // Hiragana
+            | '\u{30A0}'..='\u{30FF}' // Katakana
+            | '\u{3400}'..='\u{4DBF}' // CJK Extension A
+            | '\u{4E00}'..='\u{9FFF}' // CJK Unified Ideographs
+            | '\u{AC00}'..='\u{D7AF}' // Hangul Syllables
+    )
+}
+
+// ---------------------------------------------------------------------------
+// Output: ASCII / Unicode box
+// ---------------------------------------------------------------------------
+
+/// Inner width of the report box, in characters: the number of columns
+/// between the left and right vertical borders. The header line and every
+/// row are padded out to this width.
+const BOX_WIDTH: usize = 62; // inner width between the two side borders
+
+/// Print the human-readable report to stdout as a Unicode box.
+///
+/// The header shows the host on the left and `webclaw <version>` on the
+/// right; the rows below list raw/LLM token and byte counts, the token
+/// reduction, the elapsed time, and — only when `r.facts` is `Some` — the
+/// preserved-facts count. A one-line note about the approximate tokenizer
+/// follows the box.
+///
+/// A host too long to fit next to the version string is cut and ends in
+/// `…`, so the right-hand border stays aligned.
+fn print_box(r: &BenchResult) {
+    // Top, separator, and bottom all use the same horizontal rule.
+    let rule = "─".repeat(BOX_WIDTH);
+
+    let right = format!("webclaw {}", env!("CARGO_PKG_VERSION"));
+    let right_len = right.chars().count();
+
+    // Room left for the host: the inner width minus the version string,
+    // one space of margin on each side, and at least one space of gap.
+    let max_left = BOX_WIDTH.saturating_sub(right_len + 3);
+    let host = display_host(&r.url);
+    let left = if host.chars().count() <= max_left {
+        host
+    } else {
+        let mut clipped: String = host.chars().take(max_left.saturating_sub(1)).collect();
+        if max_left > 0 {
+            clipped.push('…');
+        }
+        clipped
+    };
+    let pad = BOX_WIDTH.saturating_sub(left.chars().count() + right_len + 2);
+    let header = format!(" {}{}{} ", left, " ".repeat(pad), right);
+
+    println!("┌{rule}┐");
+    println!("│{header}│");
+    println!("├{rule}┤");
+    print_row(
+        "raw HTML",
+        &format!("{} ≈ tokens", fmt_int(r.raw_tokens)),
+        &fmt_bytes(r.raw_bytes),
+    );
+    print_row(
+        "--format llm",
+        &format!("{} ≈ tokens", fmt_int(r.llm_tokens)),
+        &fmt_bytes(r.llm_bytes),
+    );
+    print_row("token reduction", &format!("{:.1}%", r.reduction_pct), "");
+    print_row("extraction time", &format!("{:.2} s", r.elapsed_secs), "");
+    if let Some((found, total)) = r.facts {
+        // An empty fact list is shown as 0.0% rather than dividing by zero.
+        let pct = if total == 0 {
+            0.0
+        } else {
+            100.0 * found as f64 / total as f64
+        };
+        print_row(
+            "facts preserved",
+            &format!("{found}/{total} ({pct:.1}%)"),
+            "",
+        );
+    }
+    println!("└{rule}┘");
+    println!();
+    println!("note: token counts are approximate (chars/4 Latin, chars/2 CJK).");
+}
+
+/// Print one row of the box: a label column, a middle value, and an
+/// optional right-hand value that sits flush against the right border.
+///
+/// The label is left-aligned in 18 columns after a one-space margin. The
+/// middle value is padded to fill whatever width remains, which is what
+/// pushes `right` up against the border.
+fn print_row(label: &str, middle: &str, right: &str) {
+    // Width taken by " <label>": a label longer than 18 chars widens it.
+    let label_width = 1 + label.chars().count().max(18);
+    // Width taken by "<right> ".
+    let right_width = right.chars().count() + 1;
+    let middle_width = BOX_WIDTH
+        .saturating_sub(label_width)
+        .saturating_sub(right_width);
+    println!("│ {label:<18}{middle:<middle_width$}{right} │");
+}
+
+/// Format `n` in decimal with commas between groups of three digits,
+/// e.g. `12345678` → `"12,345,678"`. Hand-rolled to avoid a formatting
+/// dependency for a single call site.
+fn fmt_int(n: usize) -> String {
+    let digits = n.to_string();
+    // `rchunks(3)` groups from the least-significant end; reversing puts
+    // the (possibly shorter) leading group first again.
+    let groups: Vec<&str> = digits
+        .as_bytes()
+        .rchunks(3)
+        .rev()
+        .map(|group| std::str::from_utf8(group).expect("decimal digits are ASCII"))
+        .collect();
+    groups.join(",")
+}
+
+/// Format a byte count for display using 1024-based units.
+///
+/// Values of 1 MiB and above are shown in `MB` with one decimal place,
+/// values of 1 KiB and above in whole `KB` (truncated, not rounded), and
+/// anything smaller in plain `B`.
+fn fmt_bytes(n: usize) -> String {
+    const KIB: usize = 1 << 10;
+    const MIB: usize = 1 << 20;
+    match n {
+        MIB.. => format!("{:.1} MB", n as f64 / MIB as f64),
+        KIB.. => format!("{} KB", n / KIB),
+        _ => format!("{n} B"),
+    }
+}
+
+/// Best-effort host name for the box header.
+///
+/// Returns the host of `url` when it parses and has one; otherwise
+/// returns `url` unchanged, so the header still shows something the user
+/// recognizes.
+fn display_host(url: &str) -> String {
+    match url::Url::parse(url) {
+        Ok(parsed) => parsed.host_str().unwrap_or(url).to_owned(),
+        Err(_) => url.to_owned(),
+    }
+}
+
+// ---------------------------------------------------------------------------
+// JSON output — single line, stable key order for scripting / CI.
+// ---------------------------------------------------------------------------
+
+/// Print the result to stdout as one line of JSON.
+///
+/// The `facts_found` / `facts_total` keys are present only when a facts
+/// result exists. Percentages are rounded to one decimal place and the
+/// elapsed time to two.
+fn print_json(r: &BenchResult) {
+    // NOTE(review): key order in the output is whatever `serde_json::Map`
+    // produces for the crate features in use — confirm if callers rely on it.
+    let mut fields: Vec<(&str, serde_json::Value)> = vec![
+        ("url", r.url.clone().into()),
+        ("raw_tokens", r.raw_tokens.into()),
+        ("raw_bytes", r.raw_bytes.into()),
+        ("llm_tokens", r.llm_tokens.into()),
+        ("llm_bytes", r.llm_bytes.into()),
+        ("token_reduction_pct", round1(r.reduction_pct).into()),
+        ("elapsed_secs", round2(r.elapsed_secs).into()),
+        ("token_method", "approx".into()),
+        ("webclaw_version", env!("CARGO_PKG_VERSION").into()),
+    ];
+    if let Some((found, total)) = r.facts {
+        fields.push(("facts_found", found.into()));
+        fields.push(("facts_total", total.into()));
+    }
+
+    let obj: serde_json::Map<String, serde_json::Value> = fields
+        .into_iter()
+        .map(|(key, value)| (key.to_owned(), value))
+        .collect();
+    // `Display` on a `Value` is the compact form, so this is a single line
+    // that can be appended straight to an ndjson file.
+    println!("{}", serde_json::Value::Object(obj));
+}
+
+/// Round `f` to one decimal place (halves round away from zero).
+fn round1(f: f64) -> f64 {
+    let tenths = (f * 10.0).round();
+    tenths / 10.0
+}
+/// Round `f` to two decimal places (halves round away from zero).
+fn round2(f: f64) -> f64 {
+    let hundredths = (f * 100.0).round();
+    hundredths / 100.0
+}
+
+// ---------------------------------------------------------------------------
+// Facts file support
+// ---------------------------------------------------------------------------
+
+/// Load a facts file and count how many of the curated facts for `url`
+/// appear in `llm_text`.
+///
+/// The file must be a JSON object with a top-level `facts` object that
+/// maps URLs to arrays of fact strings. The URL is looked up by exact key
+/// first; if that misses, a key that differs from `url` only by one
+/// trailing `/` is accepted, so `https://a.com` and `https://a.com/` find
+/// the same entry.
+///
+/// Returns `Ok(Some((found, total)))` when an entry exists, and `Ok(None)`
+/// when the URL has no entry — an uncurated site gets no fidelity row
+/// rather than a misleading `0/0`. `total` is the length of the array, so
+/// a non-string element counts toward `total` but can never be found.
+///
+/// # Errors
+///
+/// Returns a message when the file cannot be read, is not valid JSON, has
+/// no `facts` object, or the entry for the URL is not an array.
+fn check_facts(path: &Path, url: &str, llm_text: &str) -> Result<Option<(usize, usize)>, String> {
+    let raw = std::fs::read_to_string(path)
+        .map_err(|e| format!("read facts file {}: {e}", path.display()))?;
+    let parsed: serde_json::Value =
+        serde_json::from_str(&raw).map_err(|e| format!("parse facts file: {e}"))?;
+
+    let facts_obj = parsed
+        .get("facts")
+        .and_then(|v| v.as_object())
+        .ok_or_else(|| "facts file missing `facts` object".to_string())?;
+
+    // Exact key wins. Only when it misses do we compare with a single
+    // trailing slash stripped from both sides.
+    let wanted = url.strip_suffix('/').unwrap_or(url);
+    let entry = facts_obj.get(url).or_else(|| {
+        facts_obj
+            .iter()
+            .find(|(key, _)| key.strip_suffix('/').unwrap_or(key.as_str()) == wanted)
+            .map(|(_, value)| value)
+    });
+    let Some(entry) = entry else {
+        return Ok(None);
+    };
+    let Some(list) = entry.as_array() else {
+        return Err(format!("facts['{url}'] is not an array"));
+    };
+
+    // Lowercase the text once; `matches_fact` lowercases each fact itself.
+    let text_low = llm_text.to_lowercase();
+    let found = list
+        .iter()
+        .filter_map(|item| item.as_str())
+        .filter(|fact| matches_fact(&text_low, fact))
+        .count();
+    Ok(Some((found, list.len())))
+}
+
+/// Decide whether `fact` appears in `text_low`, which the caller must
+/// already have lowercased. Matching is case-insensitive.
+///
+/// - A fact made only of ASCII letters/digits and starting with a letter
+///   is matched as a whole word, so `API` does not hit `apiece`. A word
+///   edge is the start or end of the text or any character that is not
+///   alphanumeric; this check is Unicode-aware, so `caf` does not match
+///   inside `café`. `_` is not alphanumeric and therefore counts as an edge.
+/// - Any other fact (multi-word, or containing punctuation such as
+///   `99.999`) is matched as a plain substring.
+///
+/// An empty fact never matches.
+///
+/// NOTE(review): the original comment says this mirrors the Python
+/// harness in `benchmarks/scripts/bench.py`, which is not visible here —
+/// confirm the boundary rules agree.
+fn matches_fact(text_low: &str, fact: &str) -> bool {
+    let fact_low = fact.to_lowercase();
+    if fact_low.is_empty() {
+        return false;
+    }
+    let is_simple_token = fact_low.chars().all(|c| c.is_ascii_alphanumeric())
+        && fact_low.starts_with(|c: char| c.is_ascii_alphabetic());
+    if !is_simple_token {
+        return text_low.contains(&fact_low);
+    }
+
+    // `match_indices` skips overlapping hits, which is safe here: a hit
+    // that starts inside another hit is preceded by an alphanumeric
+    // character and could never be a whole-word match anyway.
+    text_low
+        .match_indices(fact_low.as_str())
+        .any(|(start, hit)| {
+            let before = text_low[..start].chars().next_back();
+            let after = text_low[start + hit.len()..].chars().next();
+            !before.is_some_and(char::is_alphanumeric) && !after.is_some_and(char::is_alphanumeric)
+        })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // --- approx_tokens -----------------------------------------------------
+
+    #[test]
+    fn approx_tokens_empty() {
+        let nothing = String::new();
+        assert_eq!(approx_tokens(&nothing), 0);
+    }
+
+    #[test]
+    fn approx_tokens_latin_roughly_chars_over_4() {
+        // 100 ASCII chars at four chars per token → 25.
+        let latin = "a".repeat(100);
+        assert_eq!(approx_tokens(&latin), 25);
+    }
+
+    #[test]
+    fn approx_tokens_cjk_denser() {
+        // 100 CJK chars at two chars per token → 50.
+        let cjk = "中".repeat(100);
+        assert_eq!(approx_tokens(&cjk), 50);
+    }
+
+    #[test]
+    fn approx_tokens_mixed_uses_latin_branch() {
+        // 20 CJK out of 100 chars is below the 30% threshold, so the
+        // four-chars-per-token branch applies.
+        let mut mixed = "a".repeat(80);
+        mixed.push_str(&"中".repeat(20));
+        assert_eq!(approx_tokens(&mixed), 25);
+    }
+
+    // --- formatters --------------------------------------------------------
+
+    #[test]
+    fn fmt_int_commas() {
+        let cases = [
+            (0, "0"),
+            (100, "100"),
+            (1_000, "1,000"),
+            (243_465, "243,465"),
+            (12_345_678, "12,345,678"),
+        ];
+        for (input, expected) in cases {
+            assert_eq!(fmt_int(input), expected);
+        }
+    }
+
+    #[test]
+    fn fmt_bytes_units() {
+        let cases = [
+            (500, "500 B"),
+            (1024, "1 KB"),
+            (1024 * 1024, "1.0 MB"),
+            (1024 * 1024 * 3 + 1024 * 512, "3.5 MB"),
+        ];
+        for (input, expected) in cases {
+            assert_eq!(fmt_bytes(input), expected);
+        }
+    }
+
+    // --- matches_fact ------------------------------------------------------
+
+    #[test]
+    fn matches_fact_word_boundary() {
+        assert!(matches_fact("the api is ready", "API"));
+        // A single alphanumeric token must match as a whole word.
+        assert!(!matches_fact("an apiece of land", "API"));
+    }
+
+    #[test]
+    fn matches_fact_multiword_substring() {
+        let hits = [
+            ("uptime is 99.999% this year", "99.999"),
+            ("the app router routes requests", "App Router"),
+        ];
+        for (text, fact) in hits {
+            assert!(matches_fact(text, fact));
+        }
+    }
+
+    #[test]
+    fn matches_fact_case_insensitive() {
+        let text = "the claude model is opus";
+        for fact in ["Claude", "opus"] {
+            assert!(matches_fact(text, fact));
+        }
+    }
+
+    #[test]
+    fn matches_fact_missing() {
+        assert!(!matches_fact("nothing to see here", "vercel"));
+    }
+
+    // --- display_host ------------------------------------------------------
+
+    #[test]
+    fn display_host_parses_url() {
+        let cases = [
+            ("https://stripe.com/", "stripe.com"),
+            ("https://docs.python.org/3/", "docs.python.org"),
+        ];
+        for (input, expected) in cases {
+            assert_eq!(display_host(input), expected);
+        }
+    }
+
+    #[test]
+    fn display_host_falls_back_on_garbage() {
+        let garbage = "not a url";
+        assert_eq!(display_host(garbage), garbage);
+    }
+}
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -1,5 +1,6 @@
 /// CLI entry point -- wires webclaw-core and webclaw-fetch into a single command.
 /// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
+mod bench;
 mod cloud;

 use std::io::{self, Read as _};
@ -8,7 +9,7 @@ use std::process;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};

-use clap::{Parser, ValueEnum};
+use clap::{Parser, Subcommand, ValueEnum};
 use tracing_subscriber::EnvFilter;
 use webclaw_core::{
    ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
@ -86,6 +87,12 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
 #[derive(Parser)]
 #[command(name = "webclaw", about = "Extract web content for LLMs", version)]
 struct Cli {
+    /// Optional subcommand. When omitted, the CLI falls back to the
+    /// traditional flag-based flow (URL + --format, --crawl, etc.).
+    /// Subcommands are used for flows that don't fit that model.
+    #[command(subcommand)]
+    command: Option<Commands>,
+
    /// URLs to fetch (multiple allowed)
    #[arg()]
    urls: Vec<String>,
@ -283,6 +290,27 @@ struct Cli {
    output_dir: Option<PathBuf>,
 }

+// The `///` comments below are user-facing: clap's derive turns them into
+// help text. The first paragraph on a variant is the short summary shown
+// in the subcommand list; later paragraphs appear only in the long help.
+#[derive(Subcommand)]
+enum Commands {
+    /// Per-URL extraction micro-benchmark.
+    ///
+    /// Compares raw HTML vs. the webclaw --format llm output on token
+    /// count, bytes, and extraction time. Token counts are approximate
+    /// (chars/4 for Latin text, chars/2 for CJK-heavy text).
+    Bench {
+        /// URL to benchmark.
+        url: String,
+
+        /// Emit a single JSON line instead of the ASCII table.
+        /// Machine-readable shape stable across releases.
+        #[arg(long)]
+        json: bool,
+
+        /// Optional path to a facts.json (same schema as the repo's
+        /// benchmarks/facts.json) for a fidelity column.
+        #[arg(long)]
+        facts: Option<PathBuf>,
+    },
+}
+
 #[derive(Clone, ValueEnum)]
 enum OutputFormat {
    Markdown,
@ -2244,6 +2272,26 @@ async fn main() {
    let cli = Cli::parse();
    init_logging(cli.verbose);

+    // Subcommand path. Handled before the flag dispatch so a subcommand
+    // can't collide with a flag-based flow. When no subcommand is set
+    // we fall through to the existing behaviour.
+    if let Some(ref cmd) = cli.command {
+        match cmd {
+            Commands::Bench { url, json, facts } => {
+                let args = bench::BenchArgs {
+                    url: url.clone(),
+                    json: *json,
+                    facts: facts.clone(),
+                };
+                if let Err(e) = bench::run(&args).await {
+                    eprintln!("error: {e}");
+                    process::exit(1);
+                }
+                return;
+            }
+        }
+    }
+
    // --map: sitemap discovery mode
    if cli.map {
        if let Err(e) = run_map(&cli).await {