//! `webclaw bench ` — per-URL extraction micro-benchmark. //! //! Fetches a page, extracts it via the same pipeline that powers //! `--format llm`, and reports how many tokens the LLM pipeline //! removed vs. the raw HTML. Optional `--facts` reuses the //! benchmark harness's curated fact lists to score fidelity. //! //! v1 uses an *approximate* tokenizer (chars/4 for Latin text, //! chars/2 for CJK-heavy text). Output is clearly labeled //! "≈ tokens" so nobody mistakes it for a real tiktoken run. //! Swapping to tiktoken-rs later is a one-function change. use std::path::{Path, PathBuf}; use std::time::Instant; use webclaw_core::{extract, to_llm_text}; use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig}; /// Inputs collected from the clap subcommand. pub struct BenchArgs { pub url: String, pub json: bool, pub facts: Option, } /// What a single bench run measures. struct BenchResult { url: String, raw_tokens: usize, raw_bytes: usize, llm_tokens: usize, llm_bytes: usize, reduction_pct: f64, elapsed_secs: f64, /// `Some((found, total))` when `--facts` is supplied and the URL has /// an entry in the facts file; `None` otherwise. facts: Option<(usize, usize)>, } pub async fn run(args: &BenchArgs) -> Result<(), String> { // Dedicated client so bench doesn't care about global CLI flags // (proxies, custom headers, etc.). A reproducible microbench is // more useful than an over-configurable one; if someone wants to // bench behind a proxy they can set WEBCLAW_PROXY — respected // by FetchConfig via the regular channels if we extend later. 
let config = FetchConfig { browser: BrowserProfile::Chrome, ..FetchConfig::default() }; let client = FetchClient::new(config).map_err(|e| format!("build client: {e}"))?; let start = Instant::now(); let fetched = client .fetch(&args.url) .await .map_err(|e| format!("fetch: {e}"))?; let extraction = extract(&fetched.html, Some(&fetched.url)).map_err(|e| format!("extract: {e}"))?; let llm_text = to_llm_text(&extraction, Some(&fetched.url)); let elapsed = start.elapsed(); let raw_tokens = approx_tokens(&fetched.html); let llm_tokens = approx_tokens(&llm_text); let raw_bytes = fetched.html.len(); let llm_bytes = llm_text.len(); let reduction_pct = if raw_tokens == 0 { 0.0 } else { 100.0 * (1.0 - llm_tokens as f64 / raw_tokens as f64) }; let facts = match args.facts.as_deref() { Some(path) => check_facts(path, &args.url, &llm_text)?, None => None, }; let result = BenchResult { url: args.url.clone(), raw_tokens, raw_bytes, llm_tokens, llm_bytes, reduction_pct, elapsed_secs: elapsed.as_secs_f64(), facts, }; if args.json { print_json(&result); } else { print_box(&result); } Ok(()) } // --------------------------------------------------------------------------- // Approximate tokenizer // --------------------------------------------------------------------------- /// Rough token count. `chars / 4` is the classic English rule of thumb /// (close to cl100k_base for typical prose). CJK scripts pack ~2 chars /// per token, so we switch to `chars / 2` when CJK dominates. /// /// Off by ±10% vs. a real BPE tokenizer, which is fine for "is webclaw's /// output 66% smaller or 66% bigger than raw HTML" — the signal is /// order-of-magnitude, not precise accounting. 
fn approx_tokens(s: &str) -> usize {
    // Single pass: count every char and, alongside, the CJK ones.
    let (total, cjk) = s.chars().fold((0usize, 0usize), |(total, cjk), c| {
        (total + 1, cjk + usize::from(is_cjk(c)))
    });
    if total == 0 {
        return 0;
    }

    let cjk_ratio = cjk as f64 / total as f64;
    // More than 30% CJK: treat the whole text as ~2 chars per token,
    // otherwise fall back to the ~4 chars per token Latin rule of thumb.
    let chars_per_token = if cjk_ratio > 0.30 { 2 } else { 4 };
    total.div_ceil(chars_per_token)
}

/// Returns `true` when `c` lies in one of the CJK blocks the tokenizer
/// heuristic cares about (ideographs, kana, Hangul syllables).
fn is_cjk(c: char) -> bool {
    matches!(
        u32::from(c),
        0x4E00..=0x9FFF // CJK Unified Ideographs
            | 0x3040..=0x309F // Hiragana
            | 0x30A0..=0x30FF // Katakana
            | 0xAC00..=0xD7AF // Hangul Syllables
            | 0x3400..=0x4DBF // CJK Extension A
    )
}

// ---------------------------------------------------------------------------
// Output: ASCII / Unicode box
// ---------------------------------------------------------------------------

const BOX_WIDTH: usize = 62; // inner width between the two side borders

/// Prints the human-readable result box followed by the tokenizer note.
///
/// The header shows the host on the left and `webclaw X.Y.Z` on the right.
/// A host too long to fit is truncated with `…` so the header never spills
/// past the right border and always keeps a space before the version label.
fn print_box(r: &BenchResult) {
    let host = display_host(&r.url);
    let version = env!("CARGO_PKG_VERSION");
    // Top border, separator and bottom border all share one horizontal rule.
    let rule = "─".repeat(BOX_WIDTH);

    // Header: host on the left, "webclaw X.Y.Z" on the right.
    let right = format!("webclaw {version}");
    let right_len = right.chars().count();
    // Reserve one margin space per side plus one separating space.
    let max_left = BOX_WIDTH.saturating_sub(right_len + 3);
    let left: String = if host.chars().count() > max_left {
        let mut clipped: String = host.chars().take(max_left.saturating_sub(1)).collect();
        if max_left > 0 {
            clipped.push('…');
        }
        clipped
    } else {
        host.chars().collect()
    };
    let pad = BOX_WIDTH.saturating_sub(left.chars().count() + right_len + 2);
    let header = format!(" {}{}{} ", left, " ".repeat(pad), right);

    println!("┌{rule}┐");
    println!("│{header}│");
    println!("├{rule}┤");

    print_row(
        "raw HTML",
        &format!("{} ≈ tokens", fmt_int(r.raw_tokens)),
        &fmt_bytes(r.raw_bytes),
    );
    print_row(
        "--format llm",
        &format!("{} ≈ tokens", fmt_int(r.llm_tokens)),
        &fmt_bytes(r.llm_bytes),
    );
    print_row("token reduction", &format!("{:.1}%", r.reduction_pct), "");
    print_row("extraction time", &format!("{:.2} s", r.elapsed_secs), "");

    // The facts row only appears when a facts check actually ran.
    if let Some((found, total)) = r.facts {
        let pct = if total == 0 {
            0.0
        } else {
            100.0 * found as f64 / total as f64
        };
        print_row(
            "facts preserved",
            &format!("{found}/{total} ({pct:.1}%)"),
            "",
        );
    }

    println!("└{rule}┘");
    println!();
    println!("note: token counts are approximate (chars/4 Latin, chars/2 CJK).");
}

fn print_row(label: &str, middle: &str, right: &str) {
    // Layout inside the box:
    // "