feat(cli): add webclaw bench <url> subcommand (closes #26)

Per-URL extraction micro-benchmark. Fetches a URL once, runs the same pipeline as --format llm, and prints a small ASCII table comparing raw HTML vs. llm output on tokens, bytes, and extraction time.

    webclaw bench https://stripe.com                # ASCII table
    webclaw bench https://stripe.com --json         # one-line JSON
    webclaw bench https://stripe.com --facts FILE   # adds fidelity row

The --facts file uses the same schema as benchmarks/facts.json (curated visible-fact list per URL). URLs not in the file produce no fidelity row, so an uncurated site doesn't show 0/0.

v1 uses an approximate tokenizer (chars/4 for Latin text, chars/2 when CJK dominates). It is off by ~10% vs. cl100k_base, but the signal — 'is the LLM output 90% smaller than the raw HTML' — is order-of-magnitude, not precise accounting. Output is labeled '~ tokens' so nobody mistakes it for a real BPE count. Swapping in tiktoken-rs later is a one-function change; it was left out of v1 to avoid the 2 MB BPE-data binary bloat for a feature most users will run a handful of times.

Implemented as a real clap subcommand (clap::Subcommand) rather than yet another flag, with the existing flag-based flow falling through when no subcommand is given. Existing 'webclaw <url> --format ...' invocations work exactly as before. This lays the groundwork for future subcommands without disrupting the legacy flat-flag UX.

12 new unit tests cover the tokenizer, formatters, host extraction, and fact-matching. Verified end-to-end on example.com and tavily.com (5/5 facts preserved at 93% token reduction).
2026-06-15 23:35:14 +02:00 · 2026-04-22 12:25:29 +02:00 · 2026-04-22 12:25:29 +02:00 · d91ad9c1f4
commit d91ad9c1f4
parent 2ba682adf3
2 changed files with 471 additions and 1 deletions
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@@ -1,5 +1,6 @@
 /// CLI entry point -- wires webclaw-core and webclaw-fetch into a single command.
 /// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
+mod bench;
 mod cloud;

 use std::io::{self, Read as _};
@@ -8,7 +9,7 @@ use std::process;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};

-use clap::{Parser, ValueEnum};
+use clap::{Parser, Subcommand, ValueEnum};
 use tracing_subscriber::EnvFilter;
 use webclaw_core::{
    ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
@@ -86,6 +87,12 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
 #[derive(Parser)]
 #[command(name = "webclaw", about = "Extract web content for LLMs", version)]
 struct Cli {
+    /// Optional subcommand. When omitted, the CLI falls back to the
+    /// traditional flag-based flow (URL + --format, --crawl, etc.).
+    /// Subcommands are used for flows that don't fit that model.
+    #[command(subcommand)]
+    command: Option<Commands>,
+
    /// URLs to fetch (multiple allowed)
    #[arg()]
    urls: Vec<String>,
@@ -283,6 +290,27 @@ struct Cli {
    output_dir: Option<PathBuf>,
 }

+#[derive(Subcommand)]
+enum Commands {
+    /// Per-URL extraction micro-benchmark: compares raw HTML vs. the
+    /// webclaw --format llm output on token count, bytes, and
+    /// extraction time. Token counts are approximate (`~ tokens`).
+    Bench {
+        /// URL to benchmark.
+        url: String,
+
+        /// Emit a single JSON line instead of the ASCII table.
+        /// Machine-readable shape stable across releases.
+        #[arg(long)]
+        json: bool,
+
+        /// Optional path to a facts.json (same schema as the repo's
+        /// benchmarks/facts.json) for a fidelity row.
+        #[arg(long)]
+        facts: Option<PathBuf>,
+    },
+}
+
 #[derive(Clone, ValueEnum)]
 enum OutputFormat {
    Markdown,
@@ -2244,6 +2272,26 @@ async fn main() {
    let cli = Cli::parse();
    init_logging(cli.verbose);

+    // Subcommand path. Handled before the flag dispatch so a subcommand
+    // can't collide with a flag-based flow. When no subcommand is set
+    // we fall through to the existing behaviour.
+    if let Some(ref cmd) = cli.command {
+        match cmd {
+            Commands::Bench { url, json, facts } => {
+                let args = bench::BenchArgs {
+                    url: url.clone(),
+                    json: *json,
+                    facts: facts.clone(),
+                };
+                if let Err(e) = bench::run(&args).await {
+                    eprintln!("error: {e}");
+                    process::exit(1);
+                }
+                return;
+            }
+        }
+    }
+
    // --map: sitemap discovery mode
    if cli.map {
        if let Err(e) = run_map(&cli).await {