diff --git a/crates/webclaw-cli/src/bench.rs b/crates/webclaw-cli/src/bench.rs
new file mode 100644
index 0000000..3e45da4
--- /dev/null
+++ b/crates/webclaw-cli/src/bench.rs
@@ -0,0 +1,422 @@
+//! `webclaw bench <url>` — per-URL extraction micro-benchmark.
+//!
+//! Fetches a page, extracts it via the same pipeline that powers
+//! `--format llm`, and reports how many tokens the LLM pipeline
+//! removed vs. the raw HTML. Optional `--facts` reuses the
+//! benchmark harness's curated fact lists to score fidelity.
+//!
+//! v1 uses an *approximate* tokenizer (chars/4 for Latin text,
+//! chars/2 for CJK-heavy text). Output is clearly labeled
+//! "≈ tokens" so nobody mistakes it for a real tiktoken run.
+//! Swapping to tiktoken-rs later is a one-function change.
+
+use std::path::{Path, PathBuf};
+use std::time::Instant;
+
+use webclaw_core::{extract, to_llm_text};
+use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig};
+
+/// Inputs collected from the clap subcommand.
+pub struct BenchArgs {
+    /// Page to fetch and benchmark.
+    pub url: String,
+    /// Emit machine-readable JSON instead of the box report.
+    pub json: bool,
+    /// Optional path to a curated facts file for fidelity scoring.
+    pub facts: Option<PathBuf>,
+}
+
+/// What a single bench run measures.
+struct BenchResult {
+    url: String,
+    raw_tokens: usize,
+    raw_bytes: usize,
+    llm_tokens: usize,
+    llm_bytes: usize,
+    reduction_pct: f64,
+    elapsed_secs: f64,
+    /// `Some((found, total))` when `--facts` is supplied and the URL has
+    /// an entry in the facts file; `None` otherwise.
+    facts: Option<(usize, usize)>,
+}
+
+/// Fetch `args.url`, run LLM extraction, and print a token-reduction report.
+///
+/// # Errors
+/// Returns a human-readable `String` error if the client can't be built,
+/// the fetch fails, extraction fails, or the facts file can't be used.
+pub async fn run(args: &BenchArgs) -> Result<(), String> {
+    // Dedicated client so bench doesn't care about global CLI flags
+    // (proxies, custom headers, etc.). A reproducible microbench is
+    // more useful than an over-configurable one; if someone wants to
+    // bench behind a proxy they can set WEBCLAW_PROXY — respected
+    // by FetchConfig via the regular channels if we extend later.
+    let config = FetchConfig {
+        browser: BrowserProfile::Chrome,
+        ..FetchConfig::default()
+    };
+    let client = FetchClient::new(config).map_err(|e| format!("build client: {e}"))?;
+
+    // Timed region covers fetch + extraction, i.e. the whole pipeline
+    // a user of `--format llm` would pay for.
+    let start = Instant::now();
+    let fetched = client
+        .fetch(&args.url)
+        .await
+        .map_err(|e| format!("fetch: {e}"))?;
+
+    let extraction =
+        extract(&fetched.html, Some(&fetched.url)).map_err(|e| format!("extract: {e}"))?;
+    let llm_text = to_llm_text(&extraction, Some(&fetched.url));
+    let elapsed = start.elapsed();
+
+    let raw_tokens = approx_tokens(&fetched.html);
+    let llm_tokens = approx_tokens(&llm_text);
+    let raw_bytes = fetched.html.len();
+    let llm_bytes = llm_text.len();
+    // Guard against divide-by-zero on an empty page.
+    let reduction_pct = if raw_tokens == 0 {
+        0.0
+    } else {
+        100.0 * (1.0 - llm_tokens as f64 / raw_tokens as f64)
+    };
+
+    let facts = match args.facts.as_deref() {
+        Some(path) => check_facts(path, &args.url, &llm_text)?,
+        None => None,
+    };
+
+    let result = BenchResult {
+        url: args.url.clone(),
+        raw_tokens,
+        raw_bytes,
+        llm_tokens,
+        llm_bytes,
+        reduction_pct,
+        elapsed_secs: elapsed.as_secs_f64(),
+        facts,
+    };
+
+    if args.json {
+        print_json(&result);
+    } else {
+        print_box(&result);
+    }
+    Ok(())
+}
+
+// ---------------------------------------------------------------------------
+// Approximate tokenizer
+// ---------------------------------------------------------------------------
+
+/// Rough token count. `chars / 4` is the classic English rule of thumb
+/// (close to cl100k_base for typical prose). CJK scripts pack ~2 chars
+/// per token, so we switch to `chars / 2` when CJK dominates.
+///
+/// Off by ±10% vs. a real BPE tokenizer, which is fine for "is webclaw's
+/// output 66% smaller or 66% bigger than raw HTML" — the signal is
+/// order-of-magnitude, not precise accounting.
+fn approx_tokens(s: &str) -> usize {
+    // Single pass over the string: tally every char and, alongside it,
+    // how many of them are CJK.
+    let (total, cjk) = s
+        .chars()
+        .fold((0usize, 0usize), |(t, k), ch| (t + 1, k + usize::from(is_cjk(ch))));
+    if total == 0 {
+        return 0;
+    }
+    // Heavy-CJK text packs ~2 chars per token; Latin-ish prose ~4.
+    let chars_per_token = if cjk as f64 / total as f64 > 0.30 { 2 } else { 4 };
+    total.div_ceil(chars_per_token)
+}
+
+/// True when `c` falls in one of the major CJK/Kana/Hangul blocks.
+fn is_cjk(c: char) -> bool {
+    matches!(
+        c,
+        '\u{4E00}'..='\u{9FFF}'       // CJK Unified Ideographs
+            | '\u{3040}'..='\u{309F}' // Hiragana
+            | '\u{30A0}'..='\u{30FF}' // Katakana
+            | '\u{AC00}'..='\u{D7AF}' // Hangul Syllables
+            | '\u{3400}'..='\u{4DBF}' // CJK Extension A
+    )
+}
+
+// ---------------------------------------------------------------------------
+// Output: ASCII / Unicode box
+// ---------------------------------------------------------------------------
+
+const BOX_WIDTH: usize = 62; // inner width between the two side borders
+
+fn print_box(r: &BenchResult) {
+    let host = display_host(&r.url);
+    let version = env!("CARGO_PKG_VERSION");
+    // One horizontal rule serves as top border, header separator, and
+    // bottom border — they are all the same run of box-drawing dashes.
+    let rule = "─".repeat(BOX_WIDTH);
+
+    // Header line: host left-aligned, "webclaw X.Y.Z" right-aligned,
+    // padded so the line exactly fills the inner width (incl. the two
+    // single-space margins).
+    let tag = format!("webclaw {version}");
+    let gap = BOX_WIDTH.saturating_sub(host.chars().count() + tag.chars().count() + 2);
+    let header = format!(" {}{}{} ", host, " ".repeat(gap), tag);
+
+    println!("┌{rule}┐");
+    println!("│{header}│");
+    println!("├{rule}┤");
+    let raw_mid = format!("{} ≈ tokens", fmt_int(r.raw_tokens));
+    print_row("raw HTML", &raw_mid, &fmt_bytes(r.raw_bytes));
+    let llm_mid = format!("{} ≈ tokens", fmt_int(r.llm_tokens));
+    print_row("--format llm", &llm_mid, &fmt_bytes(r.llm_bytes));
+    print_row("token reduction", &format!("{:.1}%", r.reduction_pct), "");
+    print_row("extraction time", &format!("{:.2} s", r.elapsed_secs), "");
+    if let Some((found, total)) = r.facts {
+        // total == 0 would otherwise divide by zero; report 0%.
+        let pct = match total {
+            0 => 0.0,
+            t => 100.0 * found as f64 / t as f64,
+        };
+        print_row("facts preserved", &format!("{found}/{total} ({pct:.1}%)"), "");
+    }
+    println!("└{rule}┘");
+    println!();
+    println!("note: token counts are approximate (chars/4 Latin, chars/2 CJK).");
+}
+
+fn print_row(label: &str, middle: &str, right: &str) {
+    // Layout inside the box:
+    // "