feat(cli): add webclaw bench <url> subcommand (closes #26)

Per-URL extraction micro-benchmark. Fetches a URL once, runs the same
pipeline as --format llm, prints a small ASCII table comparing raw
HTML vs. llm output on tokens, bytes, and extraction time.

  webclaw bench https://stripe.com               # ASCII table
  webclaw bench https://stripe.com --json        # one-line JSON
  webclaw bench https://stripe.com --facts FILE  # adds fidelity row

The --facts file uses the same schema as benchmarks/facts.json (curated
visible-fact list per URL). URLs not in the file produce no fidelity
row, so an uncurated site doesn't show 0/0.

v1 uses an approximate tokenizer (chars/4 Latin, chars/2 when CJK
dominates). Off by ~10% vs cl100k_base but the signal — 'is the LLM
output 90% smaller than the raw HTML' — is order-of-magnitude, not
precise accounting. Output is labeled '~ tokens' so nobody mistakes
it for a real BPE count. Swapping in tiktoken-rs later is a one
function change; left out of v1 to avoid the 2 MB BPE-data binary
bloat for a feature most users will run a handful of times.

Implemented as a real clap subcommand (clap::Subcommand) rather than
yet another flag, with the existing flag-based flow falling through
when no subcommand is given. Existing 'webclaw <url> --format ...'
invocations work exactly as before. Lays the groundwork for future
subcommands without disrupting the legacy flat-flag UX.

12 new unit tests cover the tokenizer, formatters, host extraction,
and fact-matching. Verified end-to-end on example.com and tavily.com
(5/5 facts preserved at 93% token reduction).
This commit is contained in:
Valerio 2026-04-22 12:25:29 +02:00
parent 2ba682adf3
commit d91ad9c1f4
2 changed files with 471 additions and 1 deletion

View file

@ -0,0 +1,422 @@
//! `webclaw bench <url>` — per-URL extraction micro-benchmark.
//!
//! Fetches a page, extracts it via the same pipeline that powers
//! `--format llm`, and reports how many tokens the LLM pipeline
//! removed vs. the raw HTML. Optional `--facts` reuses the
//! benchmark harness's curated fact lists to score fidelity.
//!
//! v1 uses an *approximate* tokenizer (chars/4 for Latin text,
//! chars/2 for CJK-heavy text). Output is clearly labeled
//! "≈ tokens" so nobody mistakes it for a real tiktoken run.
//! Swapping to tiktoken-rs later is a one-function change.
use std::path::{Path, PathBuf};
use std::time::Instant;
use webclaw_core::{extract, to_llm_text};
use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig};
/// Inputs collected from the clap subcommand.
pub struct BenchArgs {
    /// URL to benchmark (fetched exactly once).
    pub url: String,
    /// Emit a single JSON line instead of the ASCII table.
    pub json: bool,
    /// Optional path to a facts.json file; when present and the URL is
    /// curated there, a fidelity row/fields are added to the output.
    pub facts: Option<PathBuf>,
}
/// What a single bench run measures.
struct BenchResult {
    /// URL as supplied on the command line.
    url: String,
    /// Approximate token count of the raw fetched HTML.
    raw_tokens: usize,
    /// Byte length of the raw fetched HTML.
    raw_bytes: usize,
    /// Approximate token count of the `--format llm` output.
    llm_tokens: usize,
    /// Byte length of the `--format llm` output.
    llm_bytes: usize,
    /// Percent of tokens removed by the LLM pipeline; 0.0 when the raw
    /// page tokenizes to nothing (avoids a division by zero).
    reduction_pct: f64,
    /// Wall-clock seconds covering fetch + extract + LLM formatting.
    elapsed_secs: f64,
    /// `Some((found, total))` when `--facts` is supplied and the URL has
    /// an entry in the facts file; `None` otherwise.
    facts: Option<(usize, usize)>,
}
/// Execute one bench run: fetch the URL, push it through the same
/// pipeline as `--format llm`, then print either the ASCII box or a
/// single JSON line depending on `args.json`.
///
/// # Errors
/// Returns a human-readable `String` when the client can't be built,
/// the fetch fails, extraction fails, or the facts file is unusable.
pub async fn run(args: &BenchArgs) -> Result<(), String> {
    // Bench builds its own client so it is isolated from global CLI
    // flags (proxies, custom headers, etc.). A reproducible microbench
    // is more useful than an over-configurable one; proxy support can
    // come later through FetchConfig's regular channels.
    let client = FetchClient::new(FetchConfig {
        browser: BrowserProfile::Chrome,
        ..FetchConfig::default()
    })
    .map_err(|e| format!("build client: {e}"))?;

    // Timed region: fetch + extract + LLM formatting, matching what a
    // real `--format llm` invocation pays.
    let timer = Instant::now();
    let fetched = client
        .fetch(&args.url)
        .await
        .map_err(|e| format!("fetch: {e}"))?;
    let extraction =
        extract(&fetched.html, Some(&fetched.url)).map_err(|e| format!("extract: {e}"))?;
    let llm_text = to_llm_text(&extraction, Some(&fetched.url));
    let elapsed_secs = timer.elapsed().as_secs_f64();

    // Size both sides of the comparison.
    let raw_bytes = fetched.html.len();
    let llm_bytes = llm_text.len();
    let raw_tokens = approx_tokens(&fetched.html);
    let llm_tokens = approx_tokens(&llm_text);
    let reduction_pct = match raw_tokens {
        0 => 0.0, // empty page: report 0% rather than dividing by zero
        _ => 100.0 * (1.0 - llm_tokens as f64 / raw_tokens as f64),
    };

    // Optional fidelity check against a curated facts file.
    let facts = if let Some(path) = args.facts.as_deref() {
        check_facts(path, &args.url, &llm_text)?
    } else {
        None
    };

    let result = BenchResult {
        url: args.url.clone(),
        raw_tokens,
        raw_bytes,
        llm_tokens,
        llm_bytes,
        reduction_pct,
        elapsed_secs,
        facts,
    };
    if args.json {
        print_json(&result);
    } else {
        print_box(&result);
    }
    Ok(())
}
// ---------------------------------------------------------------------------
// Approximate tokenizer
// ---------------------------------------------------------------------------
/// Rough token count. `chars / 4` is the classic English rule of thumb
/// (close to cl100k_base for typical prose). CJK scripts pack ~2 chars
/// per token, so we switch to `chars / 2` when CJK dominates.
///
/// Off by ±10% vs. a real BPE tokenizer, which is fine for "is webclaw's
/// output 66% smaller or 66% bigger than raw HTML" — the signal is
/// order-of-magnitude, not precise accounting.
fn approx_tokens(s: &str) -> usize {
    let total: usize = s.chars().count();
    if total == 0 {
        return 0;
    }
    let cjk = s.chars().filter(|c| is_cjk(*c)).count();
    let cjk_ratio = cjk as f64 / total as f64;
    // > 30% CJK: treat the page as CJK-dense and use the denser ratio.
    if cjk_ratio > 0.30 {
        total.div_ceil(2)
    } else {
        total.div_ceil(4)
    }
}

/// True when `c` falls in a CJK Unicode block.
///
/// Extended beyond the original four blocks to also cover CJK
/// Compatibility Ideographs and CJK Extension B, which occur in real
/// Chinese/Japanese pages and were previously miscounted as Latin.
fn is_cjk(c: char) -> bool {
    let n = c as u32;
    (0x4E00..=0x9FFF).contains(&n) // CJK Unified Ideographs
        || (0x3040..=0x309F).contains(&n) // Hiragana
        || (0x30A0..=0x30FF).contains(&n) // Katakana
        || (0xAC00..=0xD7AF).contains(&n) // Hangul Syllables
        || (0x3400..=0x4DBF).contains(&n) // CJK Extension A
        || (0xF900..=0xFAFF).contains(&n) // CJK Compatibility Ideographs
        || (0x20000..=0x2A6DF).contains(&n) // CJK Extension B
}
// ---------------------------------------------------------------------------
// Output: ASCII / Unicode box
// ---------------------------------------------------------------------------
const BOX_WIDTH: usize = 62; // inner width between the two side borders

/// Render the human-readable box for one bench run to stdout.
///
/// NOTE(review): the `"".repeat(BOX_WIDTH)` literals below are empty
/// strings in this view — they look like box-drawing characters
/// (e.g. "─") lost in extraction. As written, `top`/`sep` render as
/// empty lines; confirm the literals against the repository.
fn print_box(r: &BenchResult) {
    let host = display_host(&r.url);
    let version = env!("CARGO_PKG_VERSION");
    let top = "".repeat(BOX_WIDTH);
    let sep = "".repeat(BOX_WIDTH);
    // Header: host on the left, "webclaw X.Y.Z" on the right.
    let left = host;
    let right = format!("webclaw {version}");
    // The +2 accounts for the one-space margin on each side of the
    // header line; saturating_sub keeps a very long host from panicking.
    let pad = BOX_WIDTH.saturating_sub(left.chars().count() + right.chars().count() + 2);
    let header = format!(" {}{}{} ", left, " ".repeat(pad), right);
    println!("{top}");
    println!("{header}");
    println!("{sep}");
    print_row(
        "raw HTML",
        &format!("{} ≈ tokens", fmt_int(r.raw_tokens)),
        &fmt_bytes(r.raw_bytes),
    );
    print_row(
        "--format llm",
        &format!("{} ≈ tokens", fmt_int(r.llm_tokens)),
        &fmt_bytes(r.llm_bytes),
    );
    print_row("token reduction", &format!("{:.1}%", r.reduction_pct), "");
    print_row("extraction time", &format!("{:.2} s", r.elapsed_secs), "");
    // Fidelity row only when --facts matched this URL (None = uncurated).
    if let Some((found, total)) = r.facts {
        let pct = if total == 0 {
            0.0
        } else {
            100.0 * found as f64 / total as f64
        };
        print_row(
            "facts preserved",
            &format!("{found}/{total} ({pct:.1}%)"),
            "",
        );
    }
    println!("{top}");
    println!();
    println!("note: token counts are approximate (chars/4 Latin, chars/2 CJK).");
}
/// Print one row inside the box:
/// `" <label padded to 18> <middle> <right> "`, with `middle` padded so
/// the row exactly fills `BOX_WIDTH` columns.
fn print_row(label: &str, middle: &str, right: &str) {
    let lead = format!(" {label:<18}");
    let tail = format!("{right} ");
    // Whatever width is left after the label and right columns goes to
    // the middle column; saturating_sub guards against overlong labels.
    let mid_width = BOX_WIDTH
        .saturating_sub(lead.chars().count())
        .saturating_sub(tail.chars().count());
    println!("{lead}{middle:<mid_width$}{tail}");
}
/// Format `n` with comma-grouped thousands (e.g. `1234567` → `"1,234,567"`).
/// Hand-rolled to avoid pulling in num-format / thousands for one call site.
fn fmt_int(n: usize) -> String {
    let digits = n.to_string();
    let len = digits.len();
    let mut grouped = String::with_capacity(len + len / 3);
    for (i, ch) in digits.chars().enumerate() {
        // A comma goes before every digit whose distance from the end
        // is a positive multiple of three.
        if i > 0 && (len - i) % 3 == 0 {
            grouped.push(',');
        }
        grouped.push(ch);
    }
    grouped
}
/// Human-readable byte size: `"x.y MB"` above 1 MiB, whole `"x KB"`
/// above 1 KiB (integer division, truncating), plain `"x B"` below.
fn fmt_bytes(n: usize) -> String {
    const KB: usize = 1024;
    const MB: usize = KB * 1024;
    match n {
        _ if n >= MB => format!("{:.1} MB", n as f64 / MB as f64),
        _ if n >= KB => format!("{} KB", n / KB),
        _ => format!("{n} B"),
    }
}
/// Best-effort host extraction — if the URL doesn't parse (or has no
/// host component) we fall back to the raw string so the box still
/// prints something recognizable.
fn display_host(url: &str) -> String {
    match url::Url::parse(url) {
        Ok(parsed) => match parsed.host_str() {
            Some(host) => host.to_string(),
            None => url.to_string(),
        },
        Err(_) => url.to_string(),
    }
}
// ---------------------------------------------------------------------------
// JSON output — single line, stable key order for scripting / CI.
// ---------------------------------------------------------------------------
/// Emit the bench result as one line of JSON on stdout.
///
/// Mirrors the ASCII table plus `token_method` and `webclaw_version`
/// so CI can archive runs; fidelity keys appear only when `--facts`
/// matched this URL.
fn print_json(r: &BenchResult) {
    let mut fields = serde_json::Map::new();
    fields.insert("url".to_string(), serde_json::Value::from(r.url.clone()));
    fields.insert("raw_tokens".to_string(), serde_json::Value::from(r.raw_tokens));
    fields.insert("raw_bytes".to_string(), serde_json::Value::from(r.raw_bytes));
    fields.insert("llm_tokens".to_string(), serde_json::Value::from(r.llm_tokens));
    fields.insert("llm_bytes".to_string(), serde_json::Value::from(r.llm_bytes));
    fields.insert(
        "token_reduction_pct".to_string(),
        serde_json::Value::from(round1(r.reduction_pct)),
    );
    fields.insert(
        "elapsed_secs".to_string(),
        serde_json::Value::from(round2(r.elapsed_secs)),
    );
    fields.insert("token_method".to_string(), serde_json::Value::from("approx"));
    fields.insert(
        "webclaw_version".to_string(),
        serde_json::Value::from(env!("CARGO_PKG_VERSION")),
    );
    if let Some((found, total)) = r.facts {
        fields.insert("facts_found".to_string(), serde_json::Value::from(found));
        fields.insert("facts_total".to_string(), serde_json::Value::from(total));
    }
    // Single-line JSON — easy to append to ndjson for CI runs.
    println!("{}", serde_json::Value::Object(fields));
}
/// Round to one decimal place (keeps the JSON percentages tidy).
fn round1(f: f64) -> f64 {
    let scaled = f * 10.0;
    scaled.round() / 10.0
}
/// Round to two decimal places (keeps the JSON timings tidy).
fn round2(f: f64) -> f64 {
    let scaled = f * 100.0;
    scaled.round() / 100.0
}
// ---------------------------------------------------------------------------
// Facts file support
// ---------------------------------------------------------------------------
/// Load `facts.json` (same schema as `benchmarks/facts.json`) and check how
/// many curated facts for this URL appear in the extracted LLM text.
/// Returns `None` when the URL has no entry in the file — don't penalize
/// a site that simply hasn't been curated yet.
///
/// # Errors
/// Fails when the file can't be read, isn't valid JSON, lacks a
/// top-level `facts` object, or the URL's entry isn't an array.
fn check_facts(path: &Path, url: &str, llm_text: &str) -> Result<Option<(usize, usize)>, String> {
    let raw = std::fs::read_to_string(path)
        .map_err(|e| format!("read facts file {}: {e}", path.display()))?;
    let parsed: serde_json::Value =
        serde_json::from_str(&raw).map_err(|e| format!("parse facts file: {e}"))?;
    let facts_obj = match parsed.get("facts").and_then(|v| v.as_object()) {
        Some(map) => map,
        None => return Err("facts file missing `facts` object".to_string()),
    };
    // URL not curated in this facts file — don't print a fidelity
    // column rather than showing a misleading 0/0.
    let entry = match facts_obj.get(url) {
        Some(e) => e,
        None => return Ok(None),
    };
    let list = entry
        .as_array()
        .ok_or_else(|| format!("facts['{url}'] is not an array"))?;
    let text_low = llm_text.to_lowercase();
    // Non-string entries are skipped when counting matches but still
    // count toward the total, same as before.
    let found = list
        .iter()
        .filter_map(|v| v.as_str())
        .filter(|fact| matches_fact(&text_low, fact))
        .count();
    Ok(Some((found, list.len())))
}
/// Match a single fact against the lowercased text. Mirrors the
/// python harness in `benchmarks/scripts/bench.py`:
/// - Single alphanumeric token starting with a letter → word-boundary
///   match (so `API` doesn't hit `apiece`).
/// - Multi-word or non-alpha facts (e.g. `99.999`) → plain substring.
fn matches_fact(text_low: &str, fact: &str) -> bool {
    let needle = fact.to_lowercase();
    if needle.is_empty() {
        return false;
    }
    // "Simple token": every char ASCII alphanumeric, first char a letter.
    let simple_token = needle
        .chars()
        .enumerate()
        .all(|(i, c)| c.is_ascii_alphanumeric() && (i > 0 || c.is_ascii_alphabetic()));
    if !simple_token {
        return text_low.contains(&needle);
    }
    // Word-boundary scan without the regex dependency: check the bytes
    // adjacent to each occurrence. Non-overlapping occurrences from
    // match_indices suffice — any skipped overlapping occurrence is
    // preceded by an alphanumeric byte of the needle and would fail the
    // boundary test anyway.
    let haystack = text_low.as_bytes();
    text_low.match_indices(&needle).any(|(start, matched)| {
        let before_ok = start
            .checked_sub(1)
            .map_or(true, |j| !haystack[j].is_ascii_alphanumeric());
        let after_ok = haystack
            .get(start + matched.len())
            .map_or(true, |b| !b.is_ascii_alphanumeric());
        before_ok && after_ok
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn approx_tokens_empty() {
        assert_eq!(approx_tokens(""), 0);
    }

    #[test]
    fn approx_tokens_latin_roughly_chars_over_4() {
        // 100 ASCII chars → ~25 tokens
        let s = "a".repeat(100);
        assert_eq!(approx_tokens(&s), 25);
    }

    #[test]
    fn approx_tokens_cjk_denser() {
        // 100 CJK chars → ~50 tokens (chars/2 branch).
        // NOTE(review): the CJK literal was lost in this view (an empty
        // string would make this assert 0, not 50); restored with U+4E00
        // ("一"), which is inside the CJK Unified Ideographs range.
        let s: String = "一".repeat(100);
        assert_eq!(approx_tokens(&s), 50);
    }

    #[test]
    fn approx_tokens_mixed_uses_latin_branch() {
        // 80 latin + 20 CJK → CJK ratio 20% < 30% → chars/4 branch.
        // NOTE(review): CJK literal restored as above.
        let s = format!("{}{}", "a".repeat(80), "一".repeat(20));
        assert_eq!(approx_tokens(&s), 25);
    }

    #[test]
    fn fmt_int_commas() {
        assert_eq!(fmt_int(0), "0");
        assert_eq!(fmt_int(100), "100");
        assert_eq!(fmt_int(1_000), "1,000");
        assert_eq!(fmt_int(243_465), "243,465");
        assert_eq!(fmt_int(12_345_678), "12,345,678");
    }

    #[test]
    fn fmt_bytes_units() {
        assert_eq!(fmt_bytes(500), "500 B");
        assert_eq!(fmt_bytes(1024), "1 KB");
        assert_eq!(fmt_bytes(1024 * 1024), "1.0 MB");
        assert_eq!(fmt_bytes(1024 * 1024 * 3 + 1024 * 512), "3.5 MB");
    }

    #[test]
    fn matches_fact_word_boundary() {
        assert!(matches_fact("the api is ready", "API"));
        // single-token alphanumeric: API should not hit apiece
        assert!(!matches_fact("an apiece of land", "API"));
    }

    #[test]
    fn matches_fact_multiword_substring() {
        assert!(matches_fact("uptime is 99.999% this year", "99.999"));
        assert!(matches_fact("the app router routes requests", "App Router"));
    }

    #[test]
    fn matches_fact_case_insensitive() {
        assert!(matches_fact("the claude model is opus", "Claude"));
        assert!(matches_fact("the claude model is opus", "opus"));
    }

    #[test]
    fn matches_fact_missing() {
        assert!(!matches_fact("nothing to see here", "vercel"));
    }

    #[test]
    fn display_host_parses_url() {
        assert_eq!(display_host("https://stripe.com/"), "stripe.com");
        assert_eq!(
            display_host("https://docs.python.org/3/"),
            "docs.python.org"
        );
    }

    #[test]
    fn display_host_falls_back_on_garbage() {
        assert_eq!(display_host("not a url"), "not a url");
    }
}

View file

@ -1,5 +1,6 @@
/// CLI entry point -- wires webclaw-core and webclaw-fetch into a single command. /// CLI entry point -- wires webclaw-core and webclaw-fetch into a single command.
/// All extraction and fetching logic lives in sibling crates; this is pure plumbing. /// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
mod bench;
mod cloud; mod cloud;
use std::io::{self, Read as _}; use std::io::{self, Read as _};
@ -8,7 +9,7 @@ use std::process;
use std::sync::Arc; use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::atomic::{AtomicBool, Ordering};
use clap::{Parser, ValueEnum}; use clap::{Parser, Subcommand, ValueEnum};
use tracing_subscriber::EnvFilter; use tracing_subscriber::EnvFilter;
use webclaw_core::{ use webclaw_core::{
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options, ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
@ -86,6 +87,12 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
#[derive(Parser)] #[derive(Parser)]
#[command(name = "webclaw", about = "Extract web content for LLMs", version)] #[command(name = "webclaw", about = "Extract web content for LLMs", version)]
struct Cli { struct Cli {
/// Optional subcommand. When omitted, the CLI falls back to the
/// traditional flag-based flow (URL + --format, --crawl, etc.).
/// Subcommands are used for flows that don't fit that model.
#[command(subcommand)]
command: Option<Commands>,
/// URLs to fetch (multiple allowed) /// URLs to fetch (multiple allowed)
#[arg()] #[arg()]
urls: Vec<String>, urls: Vec<String>,
@ -283,6 +290,27 @@ struct Cli {
output_dir: Option<PathBuf>, output_dir: Option<PathBuf>,
} }
/// Top-level subcommands. When none is given, the CLI falls back to
/// the legacy flag-based flow (URL + --format, --crawl, etc.).
#[derive(Subcommand)]
enum Commands {
    /// Per-URL extraction micro-benchmark: compares raw HTML vs. the
    /// webclaw --format llm output on token count, bytes, and
    /// extraction time. Uses an approximate tokenizer (see `--help`).
    Bench {
        /// URL to benchmark.
        url: String,
        /// Emit a single JSON line instead of the ASCII table.
        /// Machine-readable shape stable across releases.
        #[arg(long)]
        json: bool,
        /// Optional path to a facts.json (same schema as the repo's
        /// benchmarks/facts.json) for a fidelity column.
        #[arg(long)]
        facts: Option<PathBuf>,
    },
}
#[derive(Clone, ValueEnum)] #[derive(Clone, ValueEnum)]
enum OutputFormat { enum OutputFormat {
Markdown, Markdown,
@ -2244,6 +2272,26 @@ async fn main() {
let cli = Cli::parse(); let cli = Cli::parse();
init_logging(cli.verbose); init_logging(cli.verbose);
// Subcommand path. Handled before the flag dispatch so a subcommand
// can't collide with a flag-based flow. When no subcommand is set
// we fall through to the existing behaviour.
if let Some(ref cmd) = cli.command {
match cmd {
Commands::Bench { url, json, facts } => {
let args = bench::BenchArgs {
url: url.clone(),
json: *json,
facts: facts.clone(),
};
if let Err(e) = bench::run(&args).await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
}
}
// --map: sitemap discovery mode // --map: sitemap discovery mode
if cli.map { if cli.map {
if let Err(e) = run_map(&cli).await { if let Err(e) = run_map(&cli).await {