webclaw/crates/webclaw-server/src/routes/diff.rs

//! POST /v1/diff — compare current page content against a prior snapshot.
//!
//! Caller passes either a full prior `ExtractionResult` or the minimal
//! `{ markdown, metadata }` shape used by the hosted API. We re-fetch
//! the URL, extract, and run `webclaw_core::diff::diff` over the pair.

use axum::{Json, extract::State};
use serde::Deserialize;
use serde_json::{Value, json};
use webclaw_core::{Content, ExtractionResult, Metadata, diff::diff};

use crate::{error::ApiError, state::AppState};

/// JSON body accepted by the diff route.
#[derive(Debug, Deserialize)]
pub struct DiffRequest {
    /// URL of the page to re-fetch. The handler rejects values that are
    /// empty or whitespace-only.
    pub url: String,
    /// Earlier snapshot of the page that the fresh extraction is compared
    /// against.
    pub previous: PreviousSnapshot,
}

/// Either a full prior extraction, or the minimal `{ markdown, metadata }`
/// shape returned by /v1/scrape. Untagged so callers can send whichever
/// they have on hand.
///
/// Variant order matters: with `#[serde(untagged)]` the variants are tried
/// in declaration order, so a payload is only treated as `Minimal` when it
/// does not deserialize as a complete `ExtractionResult`.
#[derive(Debug, Deserialize)]
#[serde(untagged)]
pub enum PreviousSnapshot {
    /// A complete prior extraction, used as-is.
    Full(ExtractionResult),
    /// A reduced snapshot. Both fields are defaulted, so a payload that
    /// omits either key still deserializes into this variant.
    Minimal {
        /// Markdown of the prior snapshot; an empty string when omitted.
        #[serde(default)]
        markdown: String,
        /// Metadata of the prior snapshot; `None` when omitted.
        #[serde(default)]
        metadata: Option<Metadata>,
    },
}

impl PreviousSnapshot {
    /// Converts the snapshot into a full [`ExtractionResult`].
    ///
    /// A `Full` snapshot is returned unchanged. A `Minimal` snapshot is
    /// padded out: its markdown is kept, missing metadata is replaced by
    /// `empty_metadata()`, and every other field is left empty or `None`.
    fn into_extraction(self) -> ExtractionResult {
        // Nothing to synthesize when the caller already sent a full result.
        let (markdown, metadata) = match self {
            Self::Full(extraction) => return extraction,
            Self::Minimal { markdown, metadata } => (markdown, metadata),
        };

        // Only the markdown is known for a minimal snapshot; the remaining
        // content fields start out empty.
        let content = Content {
            markdown,
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        };

        ExtractionResult {
            metadata: metadata.unwrap_or_else(empty_metadata),
            content,
            domain_data: None,
            structured_data: vec![],
        }
    }
}

/// Builds a [`Metadata`] with every optional field set to `None` and a
/// `word_count` of zero.
///
/// Serves as the fallback when a minimal snapshot carries no metadata.
fn empty_metadata() -> Metadata {
    Metadata {
        // Where the page lives.
        url: None,
        site_name: None,
        favicon: None,
        // What the page says about itself.
        title: None,
        description: None,
        image: None,
        language: None,
        // Authorship details.
        author: None,
        published_date: None,
        // No content has been counted.
        word_count: 0,
    }
}

pub async fn diff_route(
    State(state): State<AppState>,
    Json(req): Json<DiffRequest>,
) -> Result<Json<Value>, ApiError> {
    if req.url.trim().is_empty() {
        return Err(ApiError::bad_request("`url` is required"));
    }
    let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;

    let current = state.fetch().fetch_and_extract(url.as_str()).await?;
    let previous = req.previous.into_extraction();
    let result = diff(&previous, &current);

    Ok(Json(json!({
        "url": req.url,
        "status": result.status,
        "diff": result.text_diff,
        "metadata_changes": result.metadata_changes,
        "links_added": result.links_added,
        "links_removed": result.links_removed,
        "word_count_delta": result.word_count_delta,
    })))
}
feat(server): add OSS webclaw-server REST API binary (closes #29) Self-hosters hitting docs/self-hosting were promised three binaries but the OSS Docker image only shipped two. webclaw-server lived in the closed-source hosted-platform repo, which couldn't be opened. This adds a minimal axum REST API in the OSS repo so self-hosting actually works without pretending to ship the cloud platform. Crate at crates/webclaw-server/. Stateless, no database, no job queue, single binary. Endpoints: GET /health, POST /v1/{scrape, crawl, map, batch, extract, summarize, diff, brand}. JSON shapes mirror api.webclaw.io for the endpoints OSS can support, so swapping between self-hosted and hosted is a base-URL change. Auth: optional bearer token via WEBCLAW_API_KEY / --api-key. Comparison is constant-time (subtle::ConstantTimeEq). Open mode (no key) is allowed and binds 127.0.0.1 by default; the Docker image flips WEBCLAW_HOST=0.0.0.0 so the container is reachable out of the box. Hard caps to keep naive callers from OOMing the process: crawl capped at 500 pages synchronously, batch capped at 100 URLs / 20 concurrent. For unbounded crawls or anti-bot bypass the docs point users at the hosted API. Dockerfile + Dockerfile.ci updated to copy webclaw-server into /usr/local/bin and EXPOSE 3000. Workspace version bumped to 0.4.0 (new public binary). 2026-04-22 12:25:11 +02:00			`//! POST /v1/diff — compare current page content against a prior snapshot.`
			`//!`
			//! Caller passes either a full prior `ExtractionResult` or the minimal
			//! `{ markdown, metadata }` shape used by the hosted API. We re-fetch
			//! the URL, extract, and run `webclaw_core::diff::diff` over the pair.

			`use axum::{Json, extract::State};`
			`use serde::Deserialize;`
			`use serde_json::{Value, json};`
			`use webclaw_core::{Content, ExtractionResult, Metadata, diff::diff};`

			`use crate::{error::ApiError, state::AppState};`

			`#[derive(Debug, Deserialize)]`
			`pub struct DiffRequest {`
			`pub url: String,`
			`pub previous: PreviousSnapshot,`
			`}`

			/// Either a full prior extraction, or the minimal `{ markdown, metadata }`
			`/// shape returned by /v1/scrape. Untagged so callers can send whichever`
			`/// they have on hand.`
			`#[derive(Debug, Deserialize)]`
			`#[serde(untagged)]`
			`pub enum PreviousSnapshot {`
			`Full(ExtractionResult),`
			`Minimal {`
			`#[serde(default)]`
			`markdown: String,`
			`#[serde(default)]`
			`metadata: Option<Metadata>,`
			`},`
			`}`

			`impl PreviousSnapshot {`
			`fn into_extraction(self) -> ExtractionResult {`
			`match self {`
			`Self::Full(r) => r,`
			`Self::Minimal { markdown, metadata } => ExtractionResult {`
			`metadata: metadata.unwrap_or_else(empty_metadata),`
			`content: Content {`
			`markdown,`
			`plain_text: String::new(),`
			`links: Vec::new(),`
			`images: Vec::new(),`
			`code_blocks: Vec::new(),`
			`raw_html: None,`
			`},`
			`domain_data: None,`
			`structured_data: Vec::new(),`
			`},`
			`}`
			`}`
			`}`

			`fn empty_metadata() -> Metadata {`
			`Metadata {`
			`title: None,`
			`description: None,`
			`author: None,`
			`published_date: None,`
			`language: None,`
			`url: None,`
			`site_name: None,`
			`image: None,`
			`favicon: None,`
			`word_count: 0,`
			`}`
			`}`

			`pub async fn diff_route(`
			`State(state): State<AppState>,`
			`Json(req): Json<DiffRequest>,`
			`) -> Result<Json<Value>, ApiError> {`
			`if req.url.trim().is_empty() {`
			return Err(ApiError::bad_request("`url` is required"));
			`}`
fix: validate self-host route URLs consistently 2026-05-04 14:30:06 +02:00			`let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;`
feat(server): add OSS webclaw-server REST API binary (closes #29) Self-hosters hitting docs/self-hosting were promised three binaries but the OSS Docker image only shipped two. webclaw-server lived in the closed-source hosted-platform repo, which couldn't be opened. This adds a minimal axum REST API in the OSS repo so self-hosting actually works without pretending to ship the cloud platform. Crate at crates/webclaw-server/. Stateless, no database, no job queue, single binary. Endpoints: GET /health, POST /v1/{scrape, crawl, map, batch, extract, summarize, diff, brand}. JSON shapes mirror api.webclaw.io for the endpoints OSS can support, so swapping between self-hosted and hosted is a base-URL change. Auth: optional bearer token via WEBCLAW_API_KEY / --api-key. Comparison is constant-time (subtle::ConstantTimeEq). Open mode (no key) is allowed and binds 127.0.0.1 by default; the Docker image flips WEBCLAW_HOST=0.0.0.0 so the container is reachable out of the box. Hard caps to keep naive callers from OOMing the process: crawl capped at 500 pages synchronously, batch capped at 100 URLs / 20 concurrent. For unbounded crawls or anti-bot bypass the docs point users at the hosted API. Dockerfile + Dockerfile.ci updated to copy webclaw-server into /usr/local/bin and EXPOSE 3000. Workspace version bumped to 0.4.0 (new public binary). 2026-04-22 12:25:11 +02:00
fix: validate self-host route URLs consistently 2026-05-04 14:30:06 +02:00			`let current = state.fetch().fetch_and_extract(url.as_str()).await?;`
feat(server): add OSS webclaw-server REST API binary (closes #29) Self-hosters hitting docs/self-hosting were promised three binaries but the OSS Docker image only shipped two. webclaw-server lived in the closed-source hosted-platform repo, which couldn't be opened. This adds a minimal axum REST API in the OSS repo so self-hosting actually works without pretending to ship the cloud platform. Crate at crates/webclaw-server/. Stateless, no database, no job queue, single binary. Endpoints: GET /health, POST /v1/{scrape, crawl, map, batch, extract, summarize, diff, brand}. JSON shapes mirror api.webclaw.io for the endpoints OSS can support, so swapping between self-hosted and hosted is a base-URL change. Auth: optional bearer token via WEBCLAW_API_KEY / --api-key. Comparison is constant-time (subtle::ConstantTimeEq). Open mode (no key) is allowed and binds 127.0.0.1 by default; the Docker image flips WEBCLAW_HOST=0.0.0.0 so the container is reachable out of the box. Hard caps to keep naive callers from OOMing the process: crawl capped at 500 pages synchronously, batch capped at 100 URLs / 20 concurrent. For unbounded crawls or anti-bot bypass the docs point users at the hosted API. Dockerfile + Dockerfile.ci updated to copy webclaw-server into /usr/local/bin and EXPOSE 3000. Workspace version bumped to 0.4.0 (new public binary). 2026-04-22 12:25:11 +02:00			`let previous = req.previous.into_extraction();`
			`let result = diff(&previous, &current);`

			`Ok(Json(json!({`
			`"url": req.url,`
			`"status": result.status,`
			`"diff": result.text_diff,`
			`"metadata_changes": result.metadata_changes,`
			`"links_added": result.links_added,`
			`"links_removed": result.links_removed,`
			`"word_count_delta": result.word_count_delta,`
			`})))`
			`}`