webclaw/crates/webclaw-server/src/routes/structured.rs

//! `POST /v1/scrape/{vertical}` and `GET /v1/extractors`.
//!
//! Vertical extractors return typed JSON instead of generic markdown.
//! See `webclaw_fetch::extractors` for the catalog and per-site logic.

use axum::{
    Json,
    extract::{Path, State},
};
use serde::Deserialize;
use serde_json::{Value, json};
use webclaw_fetch::extractors::{self, ExtractorDispatchError};

use crate::{error::ApiError, state::AppState};

/// JSON request body accepted by `scrape_vertical`.
#[derive(Debug, Deserialize)]
pub struct ScrapeRequest {
    /// Target URL handed to the extractor dispatcher. The handler rejects
    /// values that are empty or contain only whitespace.
    pub url: String,
}

/// Converts a dispatcher failure into the API-level error type, which lets
/// handlers propagate `ExtractorDispatchError` with `?`.
///
/// Mapping:
/// - fetch failure     -> `ApiError::Fetch` carrying the stringified cause
/// - unknown vertical  -> `ApiError::NotFound`
/// - URL mismatch      -> bad-request error carrying the dispatcher's message
impl From<ExtractorDispatchError> for ApiError {
    fn from(err: ExtractorDispatchError) -> Self {
        match err {
            ExtractorDispatchError::Fetch(cause) => ApiError::Fetch(cause.to_string()),
            ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
            // Bind the whole value so its Display text becomes the message.
            mismatch @ ExtractorDispatchError::UrlMismatch { .. } => {
                ApiError::bad_request(mismatch.to_string())
            }
        }
    }
}

/// `GET /v1/extractors` — catalog of all available verticals.
pub async fn list_extractors() -> Json<Value> {
    Json(json!({
        "extractors": extractors::list(),
    }))
}

/// `POST /v1/scrape/{vertical}` — run one named extractor against a URL,
/// e.g. /v1/scrape/reddit.
///
/// The vertical name comes from the path and the target URL from the JSON
/// body. On success the response echoes both alongside the extractor output
/// under `data`.
///
/// # Errors
///
/// Returns a bad-request error when `url` is empty or whitespace-only, and
/// otherwise whatever `ApiError` the dispatcher's error converts into.
pub async fn scrape_vertical(
    State(state): State<AppState>,
    Path(vertical): Path<String>,
    Json(req): Json<ScrapeRequest>,
) -> Result<Json<Value>, ApiError> {
    let target = req.url;

    // Only the emptiness check uses the trimmed form; the URL is forwarded
    // and echoed exactly as received.
    let is_blank = target.trim().is_empty();
    if is_blank {
        return Err(ApiError::bad_request("`url` is required"));
    }

    let extracted = extractors::dispatch_by_name(state.fetch(), &vertical, &target).await?;

    let body = json!({
        "vertical": vertical,
        "url": target,
        "data": extracted,
    });
    Ok(Json(body))
}
feat(extractors): add vertical extractors module + first 6 verticals

New extractors module returns site-specific typed JSON instead of generic markdown.

Each extractor:
- declares a URL pattern via matches()
- fetches from the site's official JSON API where one exists
- returns a typed serde_json::Value with documented field names
- exposes an INFO struct that powers the /v1/extractors catalog

First 6 verticals shipped, all hitting public JSON APIs (no HTML scraping, zero antibot risk):
- reddit → www.reddit.com/*/.json
- hackernews → hn.algolia.com/api/v1/items/{id} (full thread in one call)
- github_repo → api.github.com/repos/{owner}/{repo}
- pypi → pypi.org/pypi/{name}/json
- npm → registry.npmjs.org/{name} + downloads/point/last-week
- huggingface_model → huggingface.co/api/models/{owner}/{name}

Server-side routes added:
- POST /v1/scrape/{vertical} explicit per-vertical extraction
- GET /v1/extractors catalog (name, label, description, url_patterns)

The dispatcher validates that the URL matches the requested vertical before running, so users get "URL doesn't match the X extractor" instead of opaque parse failures inside the extractor.

17 unit tests cover URL matching + path parsing for each vertical. Live tests against canonical URLs (rust-lang/rust, requests pypi, react npm, whisper-large-v3 hf, item 8863 hn, an r/micro_saas post) all return correct typed JSON in 100-300ms. Sample sizes: github 863B, npm 700B, pypi 1.7KB, hf 3.2KB, hn 38KB (full comment tree).

Marketing positioning: Firecrawl charges 5 credits per /extract call and you write the schema. Webclaw returns the same JSON in 1 credit per /scrape/{vertical} call with hand-written deterministic extractors per site.

2026-04-22 14:11:43 +02:00			//! `POST /v1/scrape/{vertical}` and `GET /v1/extractors`.
			`//!`
			`//! Vertical extractors return typed JSON instead of generic markdown.`
			//! See `webclaw_fetch::extractors` for the catalog and per-site logic.

			`use axum::{`
			`Json,`
			`extract::{Path, State},`
			`};`
			`use serde::Deserialize;`
			`use serde_json::{Value, json};`
			`use webclaw_fetch::extractors::{self, ExtractorDispatchError};`

			`use crate::{error::ApiError, state::AppState};`

			`#[derive(Debug, Deserialize)]`
			`pub struct ScrapeRequest {`
			`pub url: String,`
			`}`

			`/// Map dispatcher errors to ApiError so users get clean HTTP statuses`
			`/// instead of opaque 500s.`
			`impl From<ExtractorDispatchError> for ApiError {`
			`fn from(e: ExtractorDispatchError) -> Self {`
			`match e {`
			`ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,`
			`ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),`
			`ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()),`
			`}`
			`}`
			`}`

			/// `GET /v1/extractors` — catalog of all available verticals.
			`pub async fn list_extractors() -> Json<Value> {`
			`Json(json!({`
			`"extractors": extractors::list(),`
			`}))`
			`}`

			/// `POST /v1/scrape/{vertical}` — explicit vertical, e.g. /v1/scrape/reddit.
			`pub async fn scrape_vertical(`
			`State(state): State<AppState>,`
			`Path(vertical): Path<String>,`
			`Json(req): Json<ScrapeRequest>,`
			`) -> Result<Json<Value>, ApiError> {`
			`if req.url.trim().is_empty() {`
			return Err(ApiError::bad_request("`url` is required"));
			`}`
			`let data = extractors::dispatch_by_name(state.fetch(), &vertical, &req.url).await?;`
			`Ok(Json(json!({`
			`"vertical": vertical,`
			`"url": req.url,`
			`"data": data,`
			`})))`
			`}`