From 2ba682adf353d06b2707b66f15944340fdf76256 Mon Sep 17 00:00:00 2001 From: Valerio Date: Wed, 22 Apr 2026 12:25:11 +0200 Subject: [PATCH 01/30] feat(server): add OSS webclaw-server REST API binary (closes #29) Self-hosters hitting docs/self-hosting were promised three binaries but the OSS Docker image only shipped two. webclaw-server lived in the closed-source hosted-platform repo, which couldn't be opened. This adds a minimal axum REST API in the OSS repo so self-hosting actually works without pretending to ship the cloud platform. Crate at crates/webclaw-server/. Stateless, no database, no job queue, single binary. Endpoints: GET /health, POST /v1/{scrape, crawl, map, batch, extract, summarize, diff, brand}. JSON shapes mirror api.webclaw.io for the endpoints OSS can support, so swapping between self-hosted and hosted is a base-URL change. Auth: optional bearer token via WEBCLAW_API_KEY / --api-key. Comparison is constant-time (subtle::ConstantTimeEq). Open mode (no key) is allowed and binds 127.0.0.1 by default; the Docker image flips WEBCLAW_HOST=0.0.0.0 so the container is reachable out of the box. Hard caps to keep naive callers from OOMing the process: crawl capped at 500 pages synchronously, batch capped at 100 URLs / 20 concurrent. For unbounded crawls or anti-bot bypass the docs point users at the hosted API. Dockerfile + Dockerfile.ci updated to copy webclaw-server into /usr/local/bin and EXPOSE 3000. Workspace version bumped to 0.4.0 (new public binary). 
--- CLAUDE.md | 15 +- Cargo.lock | 130 +++++++++++++++++- Cargo.toml | 2 +- Dockerfile | 28 +++- Dockerfile.ci | 9 ++ crates/webclaw-server/Cargo.toml | 29 ++++ crates/webclaw-server/src/auth.rs | 48 +++++++ crates/webclaw-server/src/error.rs | 87 ++++++++++++ crates/webclaw-server/src/main.rs | 118 ++++++++++++++++ crates/webclaw-server/src/routes/batch.rs | 85 ++++++++++++ crates/webclaw-server/src/routes/brand.rs | 32 +++++ crates/webclaw-server/src/routes/crawl.rs | 85 ++++++++++++ crates/webclaw-server/src/routes/diff.rs | 92 +++++++++++++ crates/webclaw-server/src/routes/extract.rs | 81 +++++++++++ crates/webclaw-server/src/routes/health.rs | 10 ++ crates/webclaw-server/src/routes/map.rs | 49 +++++++ crates/webclaw-server/src/routes/mod.rs | 18 +++ crates/webclaw-server/src/routes/scrape.rs | 108 +++++++++++++++ crates/webclaw-server/src/routes/summarize.rs | 52 +++++++ crates/webclaw-server/src/state.rs | 49 +++++++ 20 files changed, 1116 insertions(+), 11 deletions(-) create mode 100644 crates/webclaw-server/Cargo.toml create mode 100644 crates/webclaw-server/src/auth.rs create mode 100644 crates/webclaw-server/src/error.rs create mode 100644 crates/webclaw-server/src/main.rs create mode 100644 crates/webclaw-server/src/routes/batch.rs create mode 100644 crates/webclaw-server/src/routes/brand.rs create mode 100644 crates/webclaw-server/src/routes/crawl.rs create mode 100644 crates/webclaw-server/src/routes/diff.rs create mode 100644 crates/webclaw-server/src/routes/extract.rs create mode 100644 crates/webclaw-server/src/routes/health.rs create mode 100644 crates/webclaw-server/src/routes/map.rs create mode 100644 crates/webclaw-server/src/routes/mod.rs create mode 100644 crates/webclaw-server/src/routes/scrape.rs create mode 100644 crates/webclaw-server/src/routes/summarize.rs create mode 100644 crates/webclaw-server/src/state.rs diff --git a/CLAUDE.md b/CLAUDE.md index ad15cf1..eac2f9f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -20,9 +20,11 @@ webclaw/ 
webclaw-pdf/ # PDF text extraction via pdf-extract webclaw-mcp/ # MCP server (Model Context Protocol) for AI agents webclaw-cli/ # CLI binary + webclaw-server/ # Minimal axum REST API (self-hosting; OSS counterpart + # of api.webclaw.io, without anti-bot / JS / jobs / auth) ``` -Two binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server). +Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (REST API for self-hosting). ### Core Modules (`webclaw-core`) - `extractor.rs` — Readability-style scoring: text density, semantic tags, link density penalty @@ -60,6 +62,17 @@ Two binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server). - Works with Claude Desktop, Claude Code, and any MCP client - Uses `rmcp` crate (official Rust MCP SDK) +### REST API Server (`webclaw-server`) +- Axum 0.8, stateless, no database, no job queue +- 8 POST routes + /health, JSON shapes mirror api.webclaw.io where the + capability exists in OSS +- Constant-time bearer-token auth via `subtle::ConstantTimeEq` when + `--api-key` / `WEBCLAW_API_KEY` is set; otherwise open mode +- Hard caps: crawl ≤ 500 pages, batch ≤ 100 URLs, 20 concurrent +- Does NOT include: anti-bot bypass, JS rendering, async jobs, + multi-tenant auth, billing, proxy rotation, search/research/watch/ + agent-scrape. Those live behind api.webclaw.io and are closed-source. + ## Hard Rules - **Core has ZERO network dependencies** — takes `&str` HTML, returns structured output. Keep it WASM-compatible. 
diff --git a/Cargo.lock b/Cargo.lock index e5c30e7..0f5fc5c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -182,6 +182,70 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "axum" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90" +dependencies = [ + "axum-core", + "axum-macros", + "bytes", + "form_urlencoded", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde_core", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "sync_wrapper", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-macros" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aa268c23bfbbd2c4363b9cd302a4f504fb2a9dfe7e3451d66f35dd392e20aca" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "base64" version = "0.22.1" @@ -1132,6 +1196,12 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + 
[[package]] name = "hyper" version = "1.9.0" @@ -1145,6 +1215,7 @@ dependencies = [ "http", "http-body", "httparse", + "httpdate", "itoa", "pin-project-lite", "smallvec", @@ -1559,6 +1630,12 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "matchit" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" + [[package]] name = "md-5" version = "0.10.6" @@ -1575,6 +1652,12 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -2403,6 +2486,17 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_path_to_error" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" +dependencies = [ + "itoa", + "serde", + "serde_core", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -2757,6 +2851,7 @@ dependencies = [ "tokio", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -2780,6 +2875,7 @@ dependencies = [ "tower", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -2800,6 +2896,7 @@ version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ + "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -3102,7 +3199,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.3.19" +version = "0.4.0" dependencies = [ "clap", "dotenvy", @@ -3123,7 +3220,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = 
"0.3.19" +version = "0.4.0" dependencies = [ "ego-tree", "once_cell", @@ -3141,7 +3238,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.3.19" +version = "0.4.0" dependencies = [ "bytes", "calamine", @@ -3163,7 +3260,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.3.19" +version = "0.4.0" dependencies = [ "async-trait", "reqwest", @@ -3176,7 +3273,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.3.19" +version = "0.4.0" dependencies = [ "dirs", "dotenvy", @@ -3197,13 +3294,34 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.3.19" +version = "0.4.0" dependencies = [ "pdf-extract", "thiserror", "tracing", ] +[[package]] +name = "webclaw-server" +version = "0.4.0" +dependencies = [ + "anyhow", + "axum", + "clap", + "serde", + "serde_json", + "subtle", + "thiserror", + "tokio", + "tower-http", + "tracing", + "tracing-subscriber", + "webclaw-core", + "webclaw-fetch", + "webclaw-llm", + "webclaw-pdf", +] + [[package]] name = "webpki-root-certs" version = "1.0.6" diff --git a/Cargo.toml b/Cargo.toml index 41e78ac..e17d843 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.3.19" +version = "0.4.0" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/Dockerfile b/Dockerfile index 36fa67f..6f84e06 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,12 @@ # webclaw — Multi-stage Docker build -# Produces 2 binaries: webclaw (CLI) and webclaw-mcp (MCP server) +# Produces 3 binaries: +# webclaw — CLI (single-shot extraction, crawl, MCP-less use) +# webclaw-mcp — MCP server (stdio, for AI agents) +# webclaw-server — minimal REST API for self-hosting (OSS, stateless) +# +# NOTE: this is NOT the hosted API at api.webclaw.io — the cloud service +# adds anti-bot bypass, JS rendering, multi-tenant auth and async jobs +# that are intentionally not open-source. See docs/self-hosting. 
# --------------------------------------------------------------------------- # Stage 1: Build all binaries in release mode @@ -25,6 +32,7 @@ COPY crates/webclaw-llm/Cargo.toml crates/webclaw-llm/Cargo.toml COPY crates/webclaw-pdf/Cargo.toml crates/webclaw-pdf/Cargo.toml COPY crates/webclaw-mcp/Cargo.toml crates/webclaw-mcp/Cargo.toml COPY crates/webclaw-cli/Cargo.toml crates/webclaw-cli/Cargo.toml +COPY crates/webclaw-server/Cargo.toml crates/webclaw-server/Cargo.toml # Copy .cargo config if present (optional build flags) COPY .cargo .cargo @@ -35,7 +43,8 @@ RUN mkdir -p crates/webclaw-core/src && echo "" > crates/webclaw-core/src/lib.rs && mkdir -p crates/webclaw-llm/src && echo "" > crates/webclaw-llm/src/lib.rs \ && mkdir -p crates/webclaw-pdf/src && echo "" > crates/webclaw-pdf/src/lib.rs \ && mkdir -p crates/webclaw-mcp/src && echo "fn main() {}" > crates/webclaw-mcp/src/main.rs \ - && mkdir -p crates/webclaw-cli/src && echo "fn main() {}" > crates/webclaw-cli/src/main.rs + && mkdir -p crates/webclaw-cli/src && echo "fn main() {}" > crates/webclaw-cli/src/main.rs \ + && mkdir -p crates/webclaw-server/src && echo "fn main() {}" > crates/webclaw-server/src/main.rs # Pre-build dependencies (this layer is cached until Cargo.toml/lock changes) RUN cargo build --release 2>/dev/null || true @@ -54,9 +63,22 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ca-certificates \ && rm -rf /var/lib/apt/lists/* -# Copy both binaries +# Copy all three binaries COPY --from=builder /build/target/release/webclaw /usr/local/bin/webclaw COPY --from=builder /build/target/release/webclaw-mcp /usr/local/bin/webclaw-mcp +COPY --from=builder /build/target/release/webclaw-server /usr/local/bin/webclaw-server + +# Default port the REST API listens on when you run `webclaw-server` inside +# the container. Override with -e WEBCLAW_PORT=... or --port. Published only +# as documentation; callers still need `-p 3000:3000` on `docker run`. 
+EXPOSE 3000 + +# Container default: bind all interfaces so `-p 3000:3000` works. The binary +# itself defaults to 127.0.0.1 (safe for `cargo run` on a laptop); inside +# Docker that would make the server unreachable, so we flip it here. +# Override with -e WEBCLAW_HOST=127.0.0.1 if you front this with another +# process in the same container. +ENV WEBCLAW_HOST=0.0.0.0 # Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other # commands directly so this image can be used as a FROM base with custom CMD. diff --git a/Dockerfile.ci b/Dockerfile.ci index dd1efcb..ccd8a33 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -12,6 +12,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ARG BINARY_DIR COPY ${BINARY_DIR}/webclaw /usr/local/bin/webclaw COPY ${BINARY_DIR}/webclaw-mcp /usr/local/bin/webclaw-mcp +COPY ${BINARY_DIR}/webclaw-server /usr/local/bin/webclaw-server + +# Default REST API port when running `webclaw-server` inside the container. +EXPOSE 3000 + +# Container default: bind all interfaces so `-p 3000:3000` works. The +# binary itself defaults to 127.0.0.1; flipping here keeps the CLI safe on +# a laptop but makes the container reachable out of the box. +ENV WEBCLAW_HOST=0.0.0.0 # Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other # commands directly so this image can be used as a FROM base with custom CMD. diff --git a/crates/webclaw-server/Cargo.toml b/crates/webclaw-server/Cargo.toml new file mode 100644 index 0000000..3d4c075 --- /dev/null +++ b/crates/webclaw-server/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "webclaw-server" +version.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true +description = "Minimal REST API server for self-hosting webclaw extraction. Wraps the OSS extraction crates with HTTP endpoints. 
NOT the production hosted API at api.webclaw.io — this is a stateless, single-binary reference server for local + self-hosted deployments." + +[[bin]] +name = "webclaw-server" +path = "src/main.rs" + +[dependencies] +webclaw-core = { workspace = true } +webclaw-fetch = { workspace = true } +webclaw-llm = { workspace = true } +webclaw-pdf = { workspace = true } + +axum = { version = "0.8", features = ["macros"] } +tokio = { workspace = true } +tower-http = { version = "0.6", features = ["trace", "cors"] } +clap = { workspace = true, features = ["derive", "env"] } +serde = { workspace = true } +serde_json = { workspace = true } +tracing = { workspace = true } +tracing-subscriber = { workspace = true, features = ["env-filter"] } +anyhow = "1" +thiserror = { workspace = true } +subtle = "2.6" diff --git a/crates/webclaw-server/src/auth.rs b/crates/webclaw-server/src/auth.rs new file mode 100644 index 0000000..390afc5 --- /dev/null +++ b/crates/webclaw-server/src/auth.rs @@ -0,0 +1,48 @@ +//! Optional bearer-token middleware. +//! +//! When the server is started without `--api-key`, every request is allowed +//! through (server runs in "open" mode — appropriate for `localhost`-only +//! deployments). When a key is configured, every `/v1/*` request must +//! present `Authorization: Bearer <token>` and the comparison is constant- +//! time to avoid timing-leaking the key. + +use axum::{ + extract::{Request, State}, + http::StatusCode, + middleware::Next, + response::Response, +}; +use subtle::ConstantTimeEq; + +use crate::state::AppState; + +/// Axum middleware. Mount with `axum::middleware::from_fn_with_state`. +pub async fn require_bearer( + State(state): State<AppState>, + request: Request, + next: Next, +) -> Result<Response, StatusCode> { + let Some(expected) = state.api_key() else { + // Open mode — no key configured. Allow everything.
+ return Ok(next.run(request).await); + }; + + let Some(header) = request + .headers() + .get("authorization") + .and_then(|v| v.to_str().ok()) + else { + return Err(StatusCode::UNAUTHORIZED); + }; + + let presented = header + .strip_prefix("Bearer ") + .or_else(|| header.strip_prefix("bearer ")) + .ok_or(StatusCode::UNAUTHORIZED)?; + + if presented.as_bytes().ct_eq(expected.as_bytes()).into() { + Ok(next.run(request).await) + } else { + Err(StatusCode::UNAUTHORIZED) + } +} diff --git a/crates/webclaw-server/src/error.rs b/crates/webclaw-server/src/error.rs new file mode 100644 index 0000000..c49a1c9 --- /dev/null +++ b/crates/webclaw-server/src/error.rs @@ -0,0 +1,87 @@ +//! API error type. Maps internal errors to HTTP status codes + JSON. + +use axum::{ + Json, + http::StatusCode, + response::{IntoResponse, Response}, +}; +use serde_json::json; +use thiserror::Error; + +/// Public-facing API error. Always serializes as `{ "error": "..." }`. +/// Keep messages user-actionable; internal details belong in tracing logs. +/// +/// `Unauthorized` / `NotFound` / `Internal` are kept on the enum as +/// stable variants for handlers that don't exist yet (planned: per-key +/// rate-limit responses, dynamic route 404s). Marking them dead-code-OK +/// is preferable to inventing them later in three places. 
+#[allow(dead_code)] +#[derive(Debug, Error)] +pub enum ApiError { + #[error("{0}")] + BadRequest(String), + + #[error("unauthorized")] + Unauthorized, + + #[error("not found")] + NotFound, + + #[error("upstream fetch failed: {0}")] + Fetch(String), + + #[error("extraction failed: {0}")] + Extract(String), + + #[error("LLM provider error: {0}")] + Llm(String), + + #[error("internal: {0}")] + Internal(String), +} + +impl ApiError { + pub fn bad_request(msg: impl Into<String>) -> Self { + Self::BadRequest(msg.into()) + } + #[allow(dead_code)] + pub fn internal(msg: impl Into<String>) -> Self { + Self::Internal(msg.into()) + } + + fn status(&self) -> StatusCode { + match self { + Self::BadRequest(_) => StatusCode::BAD_REQUEST, + Self::Unauthorized => StatusCode::UNAUTHORIZED, + Self::NotFound => StatusCode::NOT_FOUND, + Self::Fetch(_) => StatusCode::BAD_GATEWAY, + Self::Extract(_) | Self::Llm(_) => StatusCode::UNPROCESSABLE_ENTITY, + Self::Internal(_) => StatusCode::INTERNAL_SERVER_ERROR, + } + } +} + +impl IntoResponse for ApiError { + fn into_response(self) -> Response { + let body = Json(json!({ "error": self.to_string() })); + (self.status(), body).into_response() + } +} + +impl From<webclaw_fetch::FetchError> for ApiError { + fn from(e: webclaw_fetch::FetchError) -> Self { + Self::Fetch(e.to_string()) + } +} + +impl From<webclaw_core::ExtractError> for ApiError { + fn from(e: webclaw_core::ExtractError) -> Self { + Self::Extract(e.to_string()) + } +} + +impl From<webclaw_llm::LlmError> for ApiError { + fn from(e: webclaw_llm::LlmError) -> Self { + Self::Llm(e.to_string()) + } +} diff --git a/crates/webclaw-server/src/main.rs b/crates/webclaw-server/src/main.rs new file mode 100644 index 0000000..c57fed8 --- /dev/null +++ b/crates/webclaw-server/src/main.rs @@ -0,0 +1,118 @@ +//! webclaw-server — minimal REST API for self-hosting webclaw extraction. +//! +//! This is the OSS reference server. It is intentionally small: +//! single binary, stateless, no database, no job queue. It wraps the +//!
same extraction crates the CLI and MCP server use, exposed over +//! HTTP with JSON shapes that mirror the hosted API at +//! api.webclaw.io where the underlying capability exists in OSS. +//! +//! Hosted-only features (anti-bot bypass, JS rendering, async crawl +//! jobs, multi-tenant auth, billing) are *not* implemented here and +//! never will be — they're closed-source. See the docs for the full +//! "what self-hosting gives you vs. what the cloud gives you" matrix. + +mod auth; +mod error; +mod routes; +mod state; + +use std::net::{IpAddr, SocketAddr}; +use std::time::Duration; + +use axum::{ + Router, + middleware::from_fn_with_state, + routing::{get, post}, +}; +use clap::Parser; +use tower_http::cors::{Any, CorsLayer}; +use tower_http::trace::TraceLayer; +use tracing::info; +use tracing_subscriber::{EnvFilter, fmt}; + +use crate::state::AppState; + +#[derive(Parser, Debug)] +#[command( + name = "webclaw-server", + version, + about = "Minimal self-hosted REST API for webclaw extraction.", + long_about = "Stateless single-binary REST API. Wraps the OSS extraction \ + crates over HTTP. For the full hosted platform (anti-bot, \ + JS render, async jobs, multi-tenant), use api.webclaw.io." +)] +struct Args { + /// Port to listen on. Env: WEBCLAW_PORT. + #[arg(short, long, env = "WEBCLAW_PORT", default_value_t = 3000)] + port: u16, + + /// Host to bind to. Env: WEBCLAW_HOST. + /// Default `127.0.0.1` keeps the server local-only; set to + /// `0.0.0.0` to expose on all interfaces (only do this with + /// `--api-key` set or behind a reverse proxy that adds auth). + #[arg(long, env = "WEBCLAW_HOST", default_value = "127.0.0.1")] + host: IpAddr, + + /// Optional bearer token. Env: WEBCLAW_API_KEY. When set, every + /// `/v1/*` request must present `Authorization: Bearer <token>`. + /// When unset, the server runs in open mode (no auth) — only + /// safe on a local-bound interface or behind another auth layer.
+ #[arg(long, env = "WEBCLAW_API_KEY")] + api_key: Option<String>, + + /// Tracing filter. Env: RUST_LOG. + #[arg(long, env = "RUST_LOG", default_value = "info,webclaw_server=info")] + log: String, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let args = Args::parse(); + + fmt() + .with_env_filter(EnvFilter::try_new(&args.log).unwrap_or_else(|_| EnvFilter::new("info"))) + .with_target(false) + .compact() + .init(); + + let state = AppState::new(args.api_key.clone())?; + + let v1 = Router::new() + .route("/scrape", post(routes::scrape::scrape)) + .route("/crawl", post(routes::crawl::crawl)) + .route("/map", post(routes::map::map)) + .route("/batch", post(routes::batch::batch)) + .route("/extract", post(routes::extract::extract)) + .route("/summarize", post(routes::summarize::summarize_route)) + .route("/diff", post(routes::diff::diff_route)) + .route("/brand", post(routes::brand::brand)) + .layer(from_fn_with_state(state.clone(), auth::require_bearer)); + + let app = Router::new() + .route("/health", get(routes::health::health)) + .nest("/v1", v1) + .layer( + // Permissive CORS — same posture as a self-hosted dev tool. + // Tighten in front with a reverse proxy if you expose this + // publicly. + CorsLayer::new() + .allow_origin(Any) + .allow_methods(Any) + .allow_headers(Any) + .max_age(Duration::from_secs(3600)), + ) + .layer(TraceLayer::new_for_http()) + .with_state(state); + + let addr = SocketAddr::from((args.host, args.port)); + let listener = tokio::net::TcpListener::bind(addr).await?; + let auth_status = if args.api_key.is_some() { + "bearer auth required" + } else { + "open mode (no auth)" + }; + info!(%addr, mode = auth_status, "webclaw-server listening"); + + axum::serve(listener, app).await?; + Ok(()) +} diff --git a/crates/webclaw-server/src/routes/batch.rs b/crates/webclaw-server/src/routes/batch.rs new file mode 100644 index 0000000..99533c9 --- /dev/null +++ b/crates/webclaw-server/src/routes/batch.rs @@ -0,0 +1,85 @@ +//!
POST /v1/batch — fetch + extract many URLs in parallel. +//! +//! `concurrency` is hard-capped at 20 to avoid hammering targets and +//! to bound memory growth for naive callers. For larger batches use +//! the hosted API. + +use axum::{Json, extract::State}; +use serde::Deserialize; +use serde_json::{Value, json}; +use webclaw_core::ExtractionOptions; + +use crate::{error::ApiError, state::AppState}; + +const HARD_MAX_URLS: usize = 100; +const HARD_MAX_CONCURRENCY: usize = 20; + +#[derive(Debug, Deserialize, Default)] +#[serde(default)] +pub struct BatchRequest { + pub urls: Vec<String>, + pub concurrency: Option<usize>, + pub include_selectors: Vec<String>, + pub exclude_selectors: Vec<String>, + pub only_main_content: bool, +} + +pub async fn batch( + State(state): State<AppState>, + Json(req): Json<BatchRequest>, +) -> Result<Json<Value>, ApiError> { + if req.urls.is_empty() { + return Err(ApiError::bad_request("`urls` is required")); + } + if req.urls.len() > HARD_MAX_URLS { + return Err(ApiError::bad_request(format!( + "too many urls: {} (max {HARD_MAX_URLS})", + req.urls.len() + ))); + } + + let concurrency = req.concurrency.unwrap_or(5).clamp(1, HARD_MAX_CONCURRENCY); + + let options = ExtractionOptions { + include_selectors: req.include_selectors, + exclude_selectors: req.exclude_selectors, + only_main_content: req.only_main_content, + include_raw_html: false, + }; + + let url_refs: Vec<&str> = req.urls.iter().map(|s| s.as_str()).collect(); + let results = state + .fetch() + .fetch_and_extract_batch_with_options(&url_refs, concurrency, &options) + .await; + + let mut ok = 0usize; + let mut errors = 0usize; + let mut out: Vec<Value> = Vec::with_capacity(results.len()); + for r in results { + match r.result { + Ok(extraction) => { + ok += 1; + out.push(json!({ + "url": r.url, + "metadata": extraction.metadata, + "markdown": extraction.content.markdown, + })); + } + Err(e) => { + errors += 1; + out.push(json!({ + "url": r.url, + "error": e.to_string(), + })); + } + } + } + + Ok(Json(json!({ + "total": out.len(), + "completed": ok,
+ "errors": errors, + "results": out, + }))) +} diff --git a/crates/webclaw-server/src/routes/brand.rs b/crates/webclaw-server/src/routes/brand.rs new file mode 100644 index 0000000..908976a --- /dev/null +++ b/crates/webclaw-server/src/routes/brand.rs @@ -0,0 +1,32 @@ +//! POST /v1/brand — extract brand identity (colors, fonts, logo) from a page. +//! +//! Pure DOM/CSS analysis — no LLM, no network beyond the page fetch itself. + +use axum::{Json, extract::State}; +use serde::Deserialize; +use serde_json::{Value, json}; +use webclaw_core::brand::extract_brand; + +use crate::{error::ApiError, state::AppState}; + +#[derive(Debug, Deserialize)] +pub struct BrandRequest { + pub url: String, +} + +pub async fn brand( + State(state): State<AppState>, + Json(req): Json<BrandRequest>, +) -> Result<Json<Value>, ApiError> { + if req.url.trim().is_empty() { + return Err(ApiError::bad_request("`url` is required")); + } + + let fetched = state.fetch().fetch(&req.url).await?; + let brand = extract_brand(&fetched.html, Some(&fetched.url)); + + Ok(Json(json!({ + "url": req.url, + "brand": brand, + }))) +} diff --git a/crates/webclaw-server/src/routes/crawl.rs b/crates/webclaw-server/src/routes/crawl.rs new file mode 100644 index 0000000..4d15195 --- /dev/null +++ b/crates/webclaw-server/src/routes/crawl.rs @@ -0,0 +1,85 @@ +//! POST /v1/crawl — synchronous BFS crawl. +//! +//! NOTE: this server is stateless — there is no job queue. Crawls run +//! inline and return when complete. `max_pages` is hard-capped at 500 +//! to avoid OOM on naive callers. For large crawls + async jobs, use +//! the hosted API at api.webclaw.io.
+ +use axum::{Json, extract::State}; +use serde::Deserialize; +use serde_json::{Value, json}; +use std::time::Duration; +use webclaw_fetch::{CrawlConfig, Crawler, FetchConfig}; + +use crate::{error::ApiError, state::AppState}; + +const HARD_MAX_PAGES: usize = 500; + +#[derive(Debug, Deserialize, Default)] +#[serde(default)] +pub struct CrawlRequest { + pub url: String, + pub max_depth: Option<usize>, + pub max_pages: Option<usize>, + pub use_sitemap: bool, + pub concurrency: Option<usize>, + pub allow_subdomains: bool, + pub allow_external_links: bool, + pub include_patterns: Vec<String>, + pub exclude_patterns: Vec<String>, +} + +pub async fn crawl( + State(_state): State<AppState>, + Json(req): Json<CrawlRequest>, +) -> Result<Json<Value>, ApiError> { + if req.url.trim().is_empty() { + return Err(ApiError::bad_request("`url` is required")); + } + let max_pages = req.max_pages.unwrap_or(50).min(HARD_MAX_PAGES); + let max_depth = req.max_depth.unwrap_or(3); + let concurrency = req.concurrency.unwrap_or(5).min(20); + + let config = CrawlConfig { + fetch: FetchConfig::default(), + max_depth, + max_pages, + concurrency, + delay: Duration::from_millis(200), + path_prefix: None, + use_sitemap: req.use_sitemap, + include_patterns: req.include_patterns, + exclude_patterns: req.exclude_patterns, + allow_subdomains: req.allow_subdomains, + allow_external_links: req.allow_external_links, + progress_tx: None, + cancel_flag: None, + }; + + let crawler = Crawler::new(&req.url, config).map_err(ApiError::from)?; + let result = crawler.crawl(&req.url, None).await; + + let pages: Vec<Value> = result + .pages + .iter() + .map(|p| { + json!({ + "url": p.url, + "depth": p.depth, + "metadata": p.extraction.as_ref().map(|e| &e.metadata), + "markdown": p.extraction.as_ref().map(|e| e.content.markdown.as_str()).unwrap_or(""), + "error": p.error, + }) + }) + .collect(); + + Ok(Json(json!({ + "url": req.url, + "status": "completed", + "total": result.total, + "completed": result.ok, + "errors": result.errors, + "elapsed_secs": result.elapsed_secs, + "pages": pages, +
}))) +} diff --git a/crates/webclaw-server/src/routes/diff.rs b/crates/webclaw-server/src/routes/diff.rs new file mode 100644 index 0000000..e4e038d --- /dev/null +++ b/crates/webclaw-server/src/routes/diff.rs @@ -0,0 +1,92 @@ +//! POST /v1/diff — compare current page content against a prior snapshot. +//! +//! Caller passes either a full prior `ExtractionResult` or the minimal +//! `{ markdown, metadata }` shape used by the hosted API. We re-fetch +//! the URL, extract, and run `webclaw_core::diff::diff` over the pair. + +use axum::{Json, extract::State}; +use serde::Deserialize; +use serde_json::{Value, json}; +use webclaw_core::{Content, ExtractionResult, Metadata, diff::diff}; + +use crate::{error::ApiError, state::AppState}; + +#[derive(Debug, Deserialize)] +pub struct DiffRequest { + pub url: String, + pub previous: PreviousSnapshot, +} + +/// Either a full prior extraction, or the minimal `{ markdown, metadata }` +/// shape returned by /v1/scrape. Untagged so callers can send whichever +/// they have on hand. 
+#[derive(Debug, Deserialize)] +#[serde(untagged)] +pub enum PreviousSnapshot { + Full(ExtractionResult), + Minimal { + #[serde(default)] + markdown: String, + #[serde(default)] + metadata: Option<Metadata>, + }, +} + +impl PreviousSnapshot { + fn into_extraction(self) -> ExtractionResult { + match self { + Self::Full(r) => r, + Self::Minimal { markdown, metadata } => ExtractionResult { + metadata: metadata.unwrap_or_else(empty_metadata), + content: Content { + markdown, + plain_text: String::new(), + links: Vec::new(), + images: Vec::new(), + code_blocks: Vec::new(), + raw_html: None, + }, + domain_data: None, + structured_data: Vec::new(), + }, + } + } +} + +fn empty_metadata() -> Metadata { + Metadata { + title: None, + description: None, + author: None, + published_date: None, + language: None, + url: None, + site_name: None, + image: None, + favicon: None, + word_count: 0, + } +} + +pub async fn diff_route( + State(state): State<AppState>, + Json(req): Json<DiffRequest>, +) -> Result<Json<Value>, ApiError> { + if req.url.trim().is_empty() { + return Err(ApiError::bad_request("`url` is required")); + } + + let current = state.fetch().fetch_and_extract(&req.url).await?; + let previous = req.previous.into_extraction(); + let result = diff(&previous, &current); + + Ok(Json(json!({ + "url": req.url, + "status": result.status, + "diff": result.text_diff, + "metadata_changes": result.metadata_changes, + "links_added": result.links_added, + "links_removed": result.links_removed, + "word_count_delta": result.word_count_delta, + }))) +} diff --git a/crates/webclaw-server/src/routes/extract.rs b/crates/webclaw-server/src/routes/extract.rs new file mode 100644 index 0000000..05b8909 --- /dev/null +++ b/crates/webclaw-server/src/routes/extract.rs @@ -0,0 +1,81 @@ +//! POST /v1/extract — LLM-powered structured extraction. +//! +//! Two modes: +//! * `schema` — JSON Schema describing what to extract. +//! * `prompt` — natural-language instructions. +//! +//! At least one must be provided. The provider chain is built per +//!
request from env (Ollama -> OpenAI -> Anthropic). Self-hosters +//! get the same fallback behaviour as the CLI. + +use axum::{Json, extract::State}; +use serde::Deserialize; +use serde_json::{Value, json}; +use webclaw_llm::{ProviderChain, extract::extract_json, extract::extract_with_prompt}; + +use crate::{error::ApiError, state::AppState}; + +#[derive(Debug, Deserialize, Default)] +#[serde(default)] +pub struct ExtractRequest { + pub url: String, + pub schema: Option, + pub prompt: Option, + /// Optional override of the provider model name (e.g. `gpt-4o-mini`). + pub model: Option, +} + +pub async fn extract( + State(state): State, + Json(req): Json, +) -> Result, ApiError> { + if req.url.trim().is_empty() { + return Err(ApiError::bad_request("`url` is required")); + } + let has_schema = req.schema.is_some(); + let has_prompt = req + .prompt + .as_deref() + .map(|p| !p.trim().is_empty()) + .unwrap_or(false); + if !has_schema && !has_prompt { + return Err(ApiError::bad_request( + "either `schema` or `prompt` is required", + )); + } + + // Fetch + extract first so we feed the LLM clean markdown instead of + // raw HTML. Cheaper tokens, better signal. + let extraction = state.fetch().fetch_and_extract(&req.url).await?; + let content = if extraction.content.markdown.trim().is_empty() { + extraction.content.plain_text.clone() + } else { + extraction.content.markdown.clone() + }; + if content.trim().is_empty() { + return Err(ApiError::Extract( + "no extractable content on page".to_string(), + )); + } + + let chain = ProviderChain::default().await; + if chain.is_empty() { + return Err(ApiError::Llm( + "no LLM providers configured (set OLLAMA_HOST, OPENAI_API_KEY, or ANTHROPIC_API_KEY)" + .to_string(), + )); + } + + let model = req.model.as_deref(); + let data = if let Some(schema) = req.schema.as_ref() { + extract_json(&content, schema, &chain, model).await? 
+    } else {
+        let prompt = req.prompt.as_deref().unwrap_or_default();
+        extract_with_prompt(&content, prompt, &chain, model).await?
+    };
+
+    Ok(Json(json!({
+        "url": req.url,
+        "data": data,
+    })))
+}
diff --git a/crates/webclaw-server/src/routes/health.rs b/crates/webclaw-server/src/routes/health.rs
new file mode 100644
index 0000000..7ccd165
--- /dev/null
+++ b/crates/webclaw-server/src/routes/health.rs
@@ -0,0 +1,10 @@
+use axum::Json;
+use serde_json::{Value, json};
+
+pub async fn health() -> Json<Value> {
+    Json(json!({
+        "status": "ok",
+        "version": env!("CARGO_PKG_VERSION"),
+        "service": "webclaw-server",
+    }))
+}
diff --git a/crates/webclaw-server/src/routes/map.rs b/crates/webclaw-server/src/routes/map.rs
new file mode 100644
index 0000000..846183a
--- /dev/null
+++ b/crates/webclaw-server/src/routes/map.rs
@@ -0,0 +1,49 @@
+//! POST /v1/map — discover URLs from a site's sitemaps.
+//!
+//! Walks robots.txt + common sitemap paths, recursively resolves
+//! `<sitemapindex>` files, and returns the deduplicated list of URLs.
+
+use axum::{Json, extract::State};
+use serde::Deserialize;
+use serde_json::{Value, json};
+use webclaw_fetch::sitemap;
+
+use crate::{error::ApiError, state::AppState};
+
+#[derive(Debug, Deserialize)]
+pub struct MapRequest {
+    pub url: String,
+    /// When true, return the full SitemapEntry objects (with lastmod,
+    /// priority, changefreq). Defaults to false → bare URL strings,
+    /// matching the hosted-API shape.
+    #[serde(default)]
+    pub include_metadata: bool,
+}
+
+pub async fn map(
+    State(state): State<AppState>,
+    Json(req): Json<MapRequest>,
+) -> Result<Json<Value>, ApiError> {
+    if req.url.trim().is_empty() {
+        return Err(ApiError::bad_request("`url` is required"));
+    }
+
+    let entries = sitemap::discover(state.fetch(), &req.url).await?;
+
+    let body = if req.include_metadata {
+        json!({
+            "url": req.url,
+            "count": entries.len(),
+            "urls": entries,
+        })
+    } else {
+        let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
+        json!({
+            "url": req.url,
+            "count": urls.len(),
+            "urls": urls,
+        })
+    };
+
+    Ok(Json(body))
+}
diff --git a/crates/webclaw-server/src/routes/mod.rs b/crates/webclaw-server/src/routes/mod.rs
new file mode 100644
index 0000000..7c3d68e
--- /dev/null
+++ b/crates/webclaw-server/src/routes/mod.rs
@@ -0,0 +1,18 @@
+//! HTTP route handlers.
+//!
+//! The OSS server exposes a deliberately small surface that mirrors the
+//! hosted-API JSON shapes where the underlying capability exists in the
+//! OSS crates. Endpoints that depend on private infrastructure
+//! (anti-bot bypass with stealth Chrome, JS rendering at scale,
+//! per-user auth, billing, async job queues, agent loops) are
+//! intentionally not implemented here. Use api.webclaw.io for those.
+
+pub mod batch;
+pub mod brand;
+pub mod crawl;
+pub mod diff;
+pub mod extract;
+pub mod health;
+pub mod map;
+pub mod scrape;
+pub mod summarize;
diff --git a/crates/webclaw-server/src/routes/scrape.rs b/crates/webclaw-server/src/routes/scrape.rs
new file mode 100644
index 0000000..1c5fc52
--- /dev/null
+++ b/crates/webclaw-server/src/routes/scrape.rs
@@ -0,0 +1,108 @@
+//! POST /v1/scrape — fetch a URL, run extraction, return the requested
+//! formats. JSON shape mirrors the hosted-API response where possible so
+//! migrating from self-hosted → cloud is a config change, not a code one.
+ +use axum::{Json, extract::State}; +use serde::Deserialize; +use serde_json::{Value, json}; +use webclaw_core::{ExtractionOptions, llm::to_llm_text}; + +use crate::{error::ApiError, state::AppState}; + +#[derive(Debug, Deserialize, Default)] +#[serde(default)] +pub struct ScrapeRequest { + pub url: String, + /// Output formats. Allowed: "markdown", "text", "llm", "json", "html". + /// Defaults to ["markdown"]. Accepts a single string ("format") + /// or an array ("formats") for hosted-API compatibility. + #[serde(alias = "format")] + pub formats: ScrapeFormats, + pub include_selectors: Vec, + pub exclude_selectors: Vec, + pub only_main_content: bool, +} + +#[derive(Debug, Deserialize)] +#[serde(untagged)] +pub enum ScrapeFormats { + One(String), + Many(Vec), +} + +impl Default for ScrapeFormats { + fn default() -> Self { + Self::Many(vec!["markdown".into()]) + } +} + +impl ScrapeFormats { + fn as_vec(&self) -> Vec { + match self { + Self::One(s) => vec![s.clone()], + Self::Many(v) => v.clone(), + } + } +} + +pub async fn scrape( + State(state): State, + Json(req): Json, +) -> Result, ApiError> { + if req.url.trim().is_empty() { + return Err(ApiError::bad_request("`url` is required")); + } + let formats = req.formats.as_vec(); + + let options = ExtractionOptions { + include_selectors: req.include_selectors, + exclude_selectors: req.exclude_selectors, + only_main_content: req.only_main_content, + include_raw_html: formats.iter().any(|f| f == "html"), + }; + + let extraction = state + .fetch() + .fetch_and_extract_with_options(&req.url, &options) + .await?; + + let mut body = json!({ + "url": extraction.metadata.url.clone().unwrap_or_else(|| req.url.clone()), + "metadata": extraction.metadata, + }); + let obj = body.as_object_mut().expect("json::object"); + + for f in &formats { + match f.as_str() { + "markdown" => { + obj.insert("markdown".into(), json!(extraction.content.markdown)); + } + "text" => { + obj.insert("text".into(), 
json!(extraction.content.plain_text)); + } + "llm" => { + let llm = to_llm_text(&extraction, extraction.metadata.url.as_deref()); + obj.insert("llm".into(), json!(llm)); + } + "html" => { + if let Some(raw) = &extraction.content.raw_html { + obj.insert("html".into(), json!(raw)); + } + } + "json" => { + obj.insert("json".into(), json!(extraction)); + } + other => { + return Err(ApiError::bad_request(format!( + "unknown format: '{other}' (allowed: markdown, text, llm, html, json)" + ))); + } + } + } + + if !extraction.structured_data.is_empty() { + obj.insert("structured_data".into(), json!(extraction.structured_data)); + } + + Ok(Json(body)) +} diff --git a/crates/webclaw-server/src/routes/summarize.rs b/crates/webclaw-server/src/routes/summarize.rs new file mode 100644 index 0000000..b967f1f --- /dev/null +++ b/crates/webclaw-server/src/routes/summarize.rs @@ -0,0 +1,52 @@ +//! POST /v1/summarize — LLM-powered page summary. + +use axum::{Json, extract::State}; +use serde::Deserialize; +use serde_json::{Value, json}; +use webclaw_llm::{ProviderChain, summarize::summarize}; + +use crate::{error::ApiError, state::AppState}; + +#[derive(Debug, Deserialize, Default)] +#[serde(default)] +pub struct SummarizeRequest { + pub url: String, + pub max_sentences: Option, + pub model: Option, +} + +pub async fn summarize_route( + State(state): State, + Json(req): Json, +) -> Result, ApiError> { + if req.url.trim().is_empty() { + return Err(ApiError::bad_request("`url` is required")); + } + + let extraction = state.fetch().fetch_and_extract(&req.url).await?; + let content = if extraction.content.markdown.trim().is_empty() { + extraction.content.plain_text.clone() + } else { + extraction.content.markdown.clone() + }; + if content.trim().is_empty() { + return Err(ApiError::Extract( + "no extractable content on page".to_string(), + )); + } + + let chain = ProviderChain::default().await; + if chain.is_empty() { + return Err(ApiError::Llm( + "no LLM providers configured (set 
OLLAMA_HOST, OPENAI_API_KEY, or ANTHROPIC_API_KEY)" + .to_string(), + )); + } + + let summary = summarize(&content, req.max_sentences, &chain, req.model.as_deref()).await?; + + Ok(Json(json!({ + "url": req.url, + "summary": summary, + }))) +} diff --git a/crates/webclaw-server/src/state.rs b/crates/webclaw-server/src/state.rs new file mode 100644 index 0000000..b3f9b6b --- /dev/null +++ b/crates/webclaw-server/src/state.rs @@ -0,0 +1,49 @@ +//! Shared application state. Cheap to clone via Arc; held by the axum +//! Router for the life of the process. + +use std::sync::Arc; +use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig}; + +/// Single-process state shared across all request handlers. +#[derive(Clone)] +pub struct AppState { + inner: Arc, +} + +struct Inner { + /// Wrapped in `Arc` because `fetch_and_extract_batch_with_options` + /// (used by the /v1/batch handler) takes `self: &Arc` so it + /// can clone the client into spawned tasks. The single-call handlers + /// auto-deref `&Arc` -> `&FetchClient`, so this costs + /// them nothing. + pub fetch: Arc, + pub api_key: Option, +} + +impl AppState { + /// Build the application state. The fetch client is constructed once + /// and shared across requests so connection pools + browser profile + /// state don't churn per request. 
+    pub fn new(api_key: Option<String>) -> anyhow::Result<Self> {
+        let config = FetchConfig {
+            browser: BrowserProfile::Chrome,
+            ..FetchConfig::default()
+        };
+        let fetch = FetchClient::new(config)
+            .map_err(|e| anyhow::anyhow!("failed to build fetch client: {e}"))?;
+        Ok(Self {
+            inner: Arc::new(Inner {
+                fetch: Arc::new(fetch),
+                api_key,
+            }),
+        })
+    }
+
+    pub fn fetch(&self) -> &Arc<FetchClient> {
+        &self.inner.fetch
+    }
+
+    pub fn api_key(&self) -> Option<&str> {
+        self.inner.api_key.as_deref()
+    }
+}

From d91ad9c1f4ca1ff7456a5e4c6a2ff7aa76efdab2 Mon Sep 17 00:00:00 2001
From: Valerio
Date: Wed, 22 Apr 2026 12:25:29 +0200
Subject: [PATCH 02/30] feat(cli): add webclaw bench subcommand (closes #26)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per-URL extraction micro-benchmark. Fetches a URL once, runs the same
pipeline as --format llm, prints a small ASCII table comparing raw HTML
vs. llm output on tokens, bytes, and extraction time.

    webclaw bench https://stripe.com              # ASCII table
    webclaw bench https://stripe.com --json       # one-line JSON
    webclaw bench https://stripe.com --facts FILE # adds fidelity row

The --facts file uses the same schema as benchmarks/facts.json (curated
visible-fact list per URL). URLs not in the file produce no fidelity
row, so an uncurated site doesn't show 0/0.

v1 uses an approximate tokenizer (chars/4 Latin, chars/2 when CJK
dominates). Off by ~10% vs cl100k_base but the signal — 'is the LLM
output 90% smaller than the raw HTML' — is order-of-magnitude, not
precise accounting. Output is labeled '~ tokens' so nobody mistakes it
for a real BPE count. Swapping in tiktoken-rs later is a one function
change; left out of v1 to avoid the 2 MB BPE-data binary bloat for a
feature most users will run a handful of times.

Implemented as a real clap subcommand (clap::Subcommand) rather than yet
another flag, with the existing flag-based flow falling through when no
subcommand is given. Existing 'webclaw --format ...' invocations work
exactly as before. Lays the groundwork for future subcommands without
disrupting the legacy flat-flag UX.

12 new unit tests cover the tokenizer, formatters, host extraction, and
fact-matching. Verified end-to-end on example.com and tavily.com (5/5
facts preserved at 93% token reduction).
---
 crates/webclaw-cli/src/bench.rs | 422 ++++++++++++++++++++++++++++++++
 crates/webclaw-cli/src/main.rs  |  50 +++-
 2 files changed, 471 insertions(+), 1 deletion(-)
 create mode 100644 crates/webclaw-cli/src/bench.rs

diff --git a/crates/webclaw-cli/src/bench.rs b/crates/webclaw-cli/src/bench.rs
new file mode 100644
index 0000000..3e45da4
--- /dev/null
+++ b/crates/webclaw-cli/src/bench.rs
@@ -0,0 +1,422 @@
+//! `webclaw bench <url>` — per-URL extraction micro-benchmark.
+//!
+//! Fetches a page, extracts it via the same pipeline that powers
+//! `--format llm`, and reports how many tokens the LLM pipeline
+//! removed vs. the raw HTML. Optional `--facts` reuses the
+//! benchmark harness's curated fact lists to score fidelity.
+//!
+//! v1 uses an *approximate* tokenizer (chars/4 for Latin text,
+//! chars/2 for CJK-heavy text). Output is clearly labeled
+//! "≈ tokens" so nobody mistakes it for a real tiktoken run.
+//! Swapping to tiktoken-rs later is a one-function change.
+
+use std::path::{Path, PathBuf};
+use std::time::Instant;
+
+use webclaw_core::{extract, to_llm_text};
+use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig};
+
+/// Inputs collected from the clap subcommand.
+pub struct BenchArgs {
+    pub url: String,
+    pub json: bool,
+    pub facts: Option<PathBuf>,
+}
+
+/// What a single bench run measures.
+struct BenchResult {
+    url: String,
+    raw_tokens: usize,
+    raw_bytes: usize,
+    llm_tokens: usize,
+    llm_bytes: usize,
+    reduction_pct: f64,
+    elapsed_secs: f64,
+    /// `Some((found, total))` when `--facts` is supplied and the URL has
+    /// an entry in the facts file; `None` otherwise.
+    facts: Option<(usize, usize)>,
+}
+
+pub async fn run(args: &BenchArgs) -> Result<(), String> {
+    // Dedicated client so bench doesn't care about global CLI flags
+    // (proxies, custom headers, etc.). A reproducible microbench is
+    // more useful than an over-configurable one; if someone wants to
+    // bench behind a proxy they can set WEBCLAW_PROXY — respected
+    // by FetchConfig via the regular channels if we extend later.
+    let config = FetchConfig {
+        browser: BrowserProfile::Chrome,
+        ..FetchConfig::default()
+    };
+    let client = FetchClient::new(config).map_err(|e| format!("build client: {e}"))?;
+
+    let start = Instant::now();
+    let fetched = client
+        .fetch(&args.url)
+        .await
+        .map_err(|e| format!("fetch: {e}"))?;
+
+    let extraction =
+        extract(&fetched.html, Some(&fetched.url)).map_err(|e| format!("extract: {e}"))?;
+    let llm_text = to_llm_text(&extraction, Some(&fetched.url));
+    let elapsed = start.elapsed();
+
+    let raw_tokens = approx_tokens(&fetched.html);
+    let llm_tokens = approx_tokens(&llm_text);
+    let raw_bytes = fetched.html.len();
+    let llm_bytes = llm_text.len();
+    let reduction_pct = if raw_tokens == 0 {
+        0.0
+    } else {
+        100.0 * (1.0 - llm_tokens as f64 / raw_tokens as f64)
+    };
+
+    let facts = match args.facts.as_deref() {
+        Some(path) => check_facts(path, &args.url, &llm_text)?,
+        None => None,
+    };
+
+    let result = BenchResult {
+        url: args.url.clone(),
+        raw_tokens,
+        raw_bytes,
+        llm_tokens,
+        llm_bytes,
+        reduction_pct,
+        elapsed_secs: elapsed.as_secs_f64(),
+        facts,
+    };
+
+    if args.json {
+        print_json(&result);
+    } else {
+        print_box(&result);
+    }
+    Ok(())
+}
+
+// ---------------------------------------------------------------------------
+// Approximate tokenizer
+// ---------------------------------------------------------------------------
+
+/// Rough token count. `chars / 4` is the classic English rule of thumb
+/// (close to cl100k_base for typical prose). CJK scripts pack ~2 chars
+/// per token, so we switch to `chars / 2` when CJK dominates.
+///
+/// Off by ±10% vs. a real BPE tokenizer, which is fine for "is webclaw's
+/// output 66% smaller or 66% bigger than raw HTML" — the signal is
+/// order-of-magnitude, not precise accounting.
+fn approx_tokens(s: &str) -> usize {
+    let total: usize = s.chars().count();
+    if total == 0 {
+        return 0;
+    }
+    let cjk = s.chars().filter(|c| is_cjk(*c)).count();
+    let cjk_ratio = cjk as f64 / total as f64;
+    if cjk_ratio > 0.30 {
+        total.div_ceil(2)
+    } else {
+        total.div_ceil(4)
+    }
+}
+
+fn is_cjk(c: char) -> bool {
+    let n = c as u32;
+    (0x4E00..=0x9FFF).contains(&n) // CJK Unified Ideographs
+        || (0x3040..=0x309F).contains(&n) // Hiragana
+        || (0x30A0..=0x30FF).contains(&n) // Katakana
+        || (0xAC00..=0xD7AF).contains(&n) // Hangul Syllables
+        || (0x3400..=0x4DBF).contains(&n) // CJK Extension A
+}
+
+// ---------------------------------------------------------------------------
+// Output: ASCII / Unicode box
+// ---------------------------------------------------------------------------
+
+const BOX_WIDTH: usize = 62; // inner width between the two side borders
+
+fn print_box(r: &BenchResult) {
+    let host = display_host(&r.url);
+    let version = env!("CARGO_PKG_VERSION");
+
+    let top = "─".repeat(BOX_WIDTH);
+    let sep = "─".repeat(BOX_WIDTH);
+
+    // Header: host on the left, "webclaw X.Y.Z" on the right.
+    let left = host;
+    let right = format!("webclaw {version}");
+    let pad = BOX_WIDTH.saturating_sub(left.chars().count() + right.chars().count() + 2);
+    let header = format!(" {}{}{} ", left, " ".repeat(pad), right);
+
+    println!("┌{top}┐");
+    println!("│{header}│");
+    println!("├{sep}┤");
+    print_row(
+        "raw HTML",
+        &format!("{} ≈ tokens", fmt_int(r.raw_tokens)),
+        &fmt_bytes(r.raw_bytes),
+    );
+    print_row(
+        "--format llm",
+        &format!("{} ≈ tokens", fmt_int(r.llm_tokens)),
+        &fmt_bytes(r.llm_bytes),
+    );
+    print_row("token reduction", &format!("{:.1}%", r.reduction_pct), "");
+    print_row("extraction time", &format!("{:.2} s", r.elapsed_secs), "");
+    if let Some((found, total)) = r.facts {
+        let pct = if total == 0 {
+            0.0
+        } else {
+            100.0 * found as f64 / total as f64
+        };
+        print_row(
+            "facts preserved",
+            &format!("{found}/{total} ({pct:.1}%)"),
+            "",
+        );
+    }
+    println!("└{top}┘");
+    println!();
+    println!("note: token counts are approximate (chars/4 Latin, chars/2 CJK).");
+}
+
+fn print_row(label: &str, middle: &str, right: &str) {
+    // Layout inside the box:
+    // "