mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-24 02:58:05 +02:00
feat(server): add OSS webclaw-server REST API binary (closes #29)
Self-hosters hitting docs/self-hosting were promised three binaries
but the OSS Docker image only shipped two. webclaw-server lived in
the closed-source hosted-platform repo, which couldn't be opened. This
adds a minimal axum REST API in the OSS repo so self-hosting actually
works without pretending to ship the cloud platform.
Crate at crates/webclaw-server/. Stateless, no database, no job queue,
single binary. Endpoints: GET /health, POST /v1/{scrape, crawl, map,
batch, extract, summarize, diff, brand}. JSON shapes mirror
api.webclaw.io for the endpoints OSS can support, so swapping between
self-hosted and hosted is a base-URL change.
Auth: optional bearer token via WEBCLAW_API_KEY / --api-key. Comparison
is constant-time (subtle::ConstantTimeEq). Open mode (no key) is
allowed and binds 127.0.0.1 by default; the Docker image flips
WEBCLAW_HOST=0.0.0.0 so the container is reachable out of the box.
Hard caps to keep naive callers from OOMing the process: crawl capped
at 500 pages synchronously, batch capped at 100 URLs / 20 concurrent.
For unbounded crawls or anti-bot bypass the docs point users at the
hosted API.
Dockerfile + Dockerfile.ci updated to copy webclaw-server into
/usr/local/bin and EXPOSE 3000. Workspace version bumped to 0.4.0
(new public binary).
This commit is contained in:
parent
b4bfff120e
commit
2ba682adf3
20 changed files with 1116 additions and 11 deletions
118
crates/webclaw-server/src/main.rs
Normal file
118
crates/webclaw-server/src/main.rs
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
//! webclaw-server — minimal REST API for self-hosting webclaw extraction.
|
||||
//!
|
||||
//! This is the OSS reference server. It is intentionally small:
|
||||
//! single binary, stateless, no database, no job queue. It wraps the
|
||||
//! same extraction crates the CLI and MCP server use, exposed over
|
||||
//! HTTP with JSON shapes that mirror the hosted API at
|
||||
//! api.webclaw.io where the underlying capability exists in OSS.
|
||||
//!
|
||||
//! Hosted-only features (anti-bot bypass, JS rendering, async crawl
|
||||
//! jobs, multi-tenant auth, billing) are *not* implemented here and
|
||||
//! never will be — they're closed-source. See the docs for the full
|
||||
//! "what self-hosting gives you vs. what the cloud gives you" matrix.
|
||||
|
||||
mod auth;
|
||||
mod error;
|
||||
mod routes;
|
||||
mod state;
|
||||
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
use std::time::Duration;
|
||||
|
||||
use axum::{
|
||||
Router,
|
||||
middleware::from_fn_with_state,
|
||||
routing::{get, post},
|
||||
};
|
||||
use clap::Parser;
|
||||
use tower_http::cors::{Any, CorsLayer};
|
||||
use tower_http::trace::TraceLayer;
|
||||
use tracing::info;
|
||||
use tracing_subscriber::{EnvFilter, fmt};
|
||||
|
||||
use crate::state::AppState;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(
|
||||
name = "webclaw-server",
|
||||
version,
|
||||
about = "Minimal self-hosted REST API for webclaw extraction.",
|
||||
long_about = "Stateless single-binary REST API. Wraps the OSS extraction \
|
||||
crates over HTTP. For the full hosted platform (anti-bot, \
|
||||
JS render, async jobs, multi-tenant), use api.webclaw.io."
|
||||
)]
|
||||
struct Args {
|
||||
/// Port to listen on. Env: WEBCLAW_PORT.
|
||||
#[arg(short, long, env = "WEBCLAW_PORT", default_value_t = 3000)]
|
||||
port: u16,
|
||||
|
||||
/// Host to bind to. Env: WEBCLAW_HOST.
|
||||
/// Default `127.0.0.1` keeps the server local-only; set to
|
||||
/// `0.0.0.0` to expose on all interfaces (only do this with
|
||||
/// `--api-key` set or behind a reverse proxy that adds auth).
|
||||
#[arg(long, env = "WEBCLAW_HOST", default_value = "127.0.0.1")]
|
||||
host: IpAddr,
|
||||
|
||||
/// Optional bearer token. Env: WEBCLAW_API_KEY. When set, every
|
||||
/// `/v1/*` request must present `Authorization: Bearer <key>`.
|
||||
/// When unset, the server runs in open mode (no auth) — only
|
||||
/// safe on a local-bound interface or behind another auth layer.
|
||||
#[arg(long, env = "WEBCLAW_API_KEY")]
|
||||
api_key: Option<String>,
|
||||
|
||||
/// Tracing filter. Env: RUST_LOG.
|
||||
#[arg(long, env = "RUST_LOG", default_value = "info,webclaw_server=info")]
|
||||
log: String,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
fmt()
|
||||
.with_env_filter(EnvFilter::try_new(&args.log).unwrap_or_else(|_| EnvFilter::new("info")))
|
||||
.with_target(false)
|
||||
.compact()
|
||||
.init();
|
||||
|
||||
let state = AppState::new(args.api_key.clone())?;
|
||||
|
||||
let v1 = Router::new()
|
||||
.route("/scrape", post(routes::scrape::scrape))
|
||||
.route("/crawl", post(routes::crawl::crawl))
|
||||
.route("/map", post(routes::map::map))
|
||||
.route("/batch", post(routes::batch::batch))
|
||||
.route("/extract", post(routes::extract::extract))
|
||||
.route("/summarize", post(routes::summarize::summarize_route))
|
||||
.route("/diff", post(routes::diff::diff_route))
|
||||
.route("/brand", post(routes::brand::brand))
|
||||
.layer(from_fn_with_state(state.clone(), auth::require_bearer));
|
||||
|
||||
let app = Router::new()
|
||||
.route("/health", get(routes::health::health))
|
||||
.nest("/v1", v1)
|
||||
.layer(
|
||||
// Permissive CORS — same posture as a self-hosted dev tool.
|
||||
// Tighten in front with a reverse proxy if you expose this
|
||||
// publicly.
|
||||
CorsLayer::new()
|
||||
.allow_origin(Any)
|
||||
.allow_methods(Any)
|
||||
.allow_headers(Any)
|
||||
.max_age(Duration::from_secs(3600)),
|
||||
)
|
||||
.layer(TraceLayer::new_for_http())
|
||||
.with_state(state);
|
||||
|
||||
let addr = SocketAddr::from((args.host, args.port));
|
||||
let listener = tokio::net::TcpListener::bind(addr).await?;
|
||||
let auth_status = if args.api_key.is_some() {
|
||||
"bearer auth required"
|
||||
} else {
|
||||
"open mode (no auth)"
|
||||
};
|
||||
info!(%addr, mode = auth_status, "webclaw-server listening");
|
||||
|
||||
axum::serve(listener, app).await?;
|
||||
Ok(())
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue