mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-15 23:35:14 +02:00
feat(cli): add webclaw bench <url> subcommand (closes #26)
Per-URL extraction micro-benchmark. Fetches a URL once, runs the same pipeline as --format llm, prints a small ASCII table comparing raw HTML vs. llm output on tokens, bytes, and extraction time.

    webclaw bench https://stripe.com               # ASCII table
    webclaw bench https://stripe.com --json        # one-line JSON
    webclaw bench https://stripe.com --facts FILE  # adds fidelity row

The --facts file uses the same schema as benchmarks/facts.json (curated visible-fact list per URL). URLs not in the file produce no fidelity row, so an uncurated site doesn't show 0/0.

v1 uses an approximate tokenizer (chars/4 Latin, chars/2 when CJK dominates). Off by ~10% vs cl100k_base but the signal — 'is the LLM output 90% smaller than the raw HTML' — is order-of-magnitude, not precise accounting. Output is labeled '~ tokens' so nobody mistakes it for a real BPE count. Swapping in tiktoken-rs later is a one-function change; left out of v1 to avoid the 2 MB BPE-data binary bloat for a feature most users will run a handful of times.

Implemented as a real clap subcommand (clap::Subcommand) rather than yet another flag, with the existing flag-based flow falling through when no subcommand is given. Existing 'webclaw <url> --format ...' invocations work exactly as before. Lays the groundwork for future subcommands without disrupting the legacy flat-flag UX.

12 new unit tests cover the tokenizer, formatters, host extraction, and fact-matching. Verified end-to-end on example.com and tavily.com (5/5 facts preserved at 93% token reduction).
This commit is contained in:
parent
2ba682adf3
commit
d91ad9c1f4
2 changed files with 471 additions and 1 deletions
|
|
@ -1,5 +1,6 @@
|
|||
/// CLI entry point -- wires webclaw-core and webclaw-fetch into a single command.
|
||||
/// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
|
||||
mod bench;
|
||||
mod cloud;
|
||||
|
||||
use std::io::{self, Read as _};
|
||||
|
|
@ -8,7 +9,7 @@ use std::process;
|
|||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
use clap::{Parser, ValueEnum};
|
||||
use clap::{Parser, Subcommand, ValueEnum};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
use webclaw_core::{
|
||||
ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
|
||||
|
|
@ -86,6 +87,12 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
|
|||
#[derive(Parser)]
|
||||
#[command(name = "webclaw", about = "Extract web content for LLMs", version)]
|
||||
struct Cli {
|
||||
/// Optional subcommand. When omitted, the CLI falls back to the
|
||||
/// traditional flag-based flow (URL + --format, --crawl, etc.).
|
||||
/// Subcommands are used for flows that don't fit that model.
|
||||
#[command(subcommand)]
|
||||
command: Option<Commands>,
|
||||
|
||||
/// URLs to fetch (multiple allowed)
|
||||
#[arg()]
|
||||
urls: Vec<String>,
|
||||
|
|
@ -283,6 +290,27 @@ struct Cli {
|
|||
output_dir: Option<PathBuf>,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
/// Per-URL extraction micro-benchmark: compares raw HTML vs. the
|
||||
/// webclaw --format llm output on token count, bytes, and
|
||||
/// extraction time. Uses an approximate tokenizer (see `--help`).
|
||||
Bench {
|
||||
/// URL to benchmark.
|
||||
url: String,
|
||||
|
||||
/// Emit a single JSON line instead of the ASCII table.
|
||||
/// Machine-readable shape stable across releases.
|
||||
#[arg(long)]
|
||||
json: bool,
|
||||
|
||||
/// Optional path to a facts.json (same schema as the repo's
|
||||
/// benchmarks/facts.json) for a fidelity column.
|
||||
#[arg(long)]
|
||||
facts: Option<PathBuf>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Clone, ValueEnum)]
|
||||
enum OutputFormat {
|
||||
Markdown,
|
||||
|
|
@ -2244,6 +2272,26 @@ async fn main() {
|
|||
let cli = Cli::parse();
|
||||
init_logging(cli.verbose);
|
||||
|
||||
// Subcommand path. Handled before the flag dispatch so a subcommand
|
||||
// can't collide with a flag-based flow. When no subcommand is set
|
||||
// we fall through to the existing behaviour.
|
||||
if let Some(ref cmd) = cli.command {
|
||||
match cmd {
|
||||
Commands::Bench { url, json, facts } => {
|
||||
let args = bench::BenchArgs {
|
||||
url: url.clone(),
|
||||
json: *json,
|
||||
facts: facts.clone(),
|
||||
};
|
||||
if let Err(e) = bench::run(&args).await {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --map: sitemap discovery mode
|
||||
if cli.map {
|
||||
if let Err(e) = run_map(&cli).await {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue