//! webclaw-server — minimal REST API for self-hosting webclaw extraction. //! //! This is the OSS reference server. It is intentionally small: //! single binary, stateless, no database, no job queue. It wraps the //! same extraction crates the CLI and MCP server use, exposed over //! HTTP with JSON shapes that mirror the hosted API at //! api.webclaw.io where the underlying capability exists in OSS. //! //! Hosted-only features (anti-bot bypass, JS rendering, async crawl //! jobs, multi-tenant auth, billing) are *not* implemented here and //! never will be — they're closed-source. See the docs for the full //! "what self-hosting gives you vs. what the cloud gives you" matrix. mod auth; mod error; mod routes; mod state; use std::net::{IpAddr, SocketAddr}; use std::time::Duration; use axum::{ Router, middleware::from_fn_with_state, routing::{get, post}, }; use clap::Parser; use tower_http::cors::{Any, CorsLayer}; use tower_http::trace::TraceLayer; use tracing::info; use tracing_subscriber::{EnvFilter, fmt}; use crate::state::AppState; #[derive(Parser, Debug)] #[command( name = "webclaw-server", version, about = "Minimal self-hosted REST API for webclaw extraction.", long_about = "Stateless single-binary REST API. Wraps the OSS extraction \ crates over HTTP. For the full hosted platform (anti-bot, \ JS render, async jobs, multi-tenant), use api.webclaw.io." )] struct Args { /// Port to listen on. Env: WEBCLAW_PORT. #[arg(short, long, env = "WEBCLAW_PORT", default_value_t = 3000)] port: u16, /// Host to bind to. Env: WEBCLAW_HOST. /// Default `127.0.0.1` keeps the server local-only; set to /// `0.0.0.0` to expose on all interfaces (only do this with /// `--api-key` set or behind a reverse proxy that adds auth). #[arg(long, env = "WEBCLAW_HOST", default_value = "127.0.0.1")] host: IpAddr, /// Optional bearer token. Env: WEBCLAW_API_KEY. When set, every /// `/v1/*` request must present `Authorization: Bearer `. /// When unset, the server runs in open mode (no auth) — only /// safe on a local-bound interface or behind another auth layer. #[arg(long, env = "WEBCLAW_API_KEY")] api_key: Option, /// Tracing filter. Env: RUST_LOG. #[arg(long, env = "RUST_LOG", default_value = "info,webclaw_server=info")] log: String, } #[tokio::main] async fn main() -> anyhow::Result<()> { let args = Args::parse(); fmt() .with_env_filter(EnvFilter::try_new(&args.log).unwrap_or_else(|_| EnvFilter::new("info"))) .with_target(false) .compact() .init(); let state = AppState::new(args.api_key.clone())?; let v1 = Router::new() .route("/scrape", post(routes::scrape::scrape)) .route( "/scrape/{vertical}", post(routes::structured::scrape_vertical), ) .route("/crawl", post(routes::crawl::crawl)) .route("/map", post(routes::map::map)) .route("/batch", post(routes::batch::batch)) .route("/extract", post(routes::extract::extract)) .route("/extractors", get(routes::structured::list_extractors)) .route("/summarize", post(routes::summarize::summarize_route)) .route("/diff", post(routes::diff::diff_route)) .route("/brand", post(routes::brand::brand)) .layer(from_fn_with_state(state.clone(), auth::require_bearer)); let app = Router::new() .route("/health", get(routes::health::health)) .nest("/v1", v1) .layer( // Permissive CORS — same posture as a self-hosted dev tool. // Tighten in front with a reverse proxy if you expose this // publicly. CorsLayer::new() .allow_origin(Any) .allow_methods(Any) .allow_headers(Any) .max_age(Duration::from_secs(3600)), ) .layer(TraceLayer::new_for_http()) .with_state(state); let addr = SocketAddr::from((args.host, args.port)); let listener = tokio::net::TcpListener::bind(addr).await?; let auth_status = if args.api_key.is_some() { "bearer auth required" } else { "open mode (no auth)" }; info!(%addr, mode = auth_status, "webclaw-server listening"); axum::serve(listener, app).await?; Ok(()) }