mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-05-29 20:45:12 +02:00
56 lines
1.7 KiB
Rust
56 lines
1.7 KiB
Rust
|
|
//! `POST /v1/scrape/{vertical}` and `GET /v1/extractors`.
|
||
|
|
//!
|
||
|
|
//! Vertical extractors return typed JSON instead of generic markdown.
|
||
|
|
//! See `webclaw_fetch::extractors` for the catalog and per-site logic.
|
||
|
|
|
||
|
|
use axum::{
|
||
|
|
Json,
|
||
|
|
extract::{Path, State},
|
||
|
|
};
|
||
|
|
use serde::Deserialize;
|
||
|
|
use serde_json::{Value, json};
|
||
|
|
use webclaw_fetch::extractors::{self, ExtractorDispatchError};
|
||
|
|
|
||
|
|
use crate::{error::ApiError, state::AppState};
|
||
|
|
|
||
|
|
#[derive(Debug, Deserialize)]
|
||
|
|
pub struct ScrapeRequest {
|
||
|
|
pub url: String,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Map dispatcher errors to ApiError so users get clean HTTP statuses
|
||
|
|
/// instead of opaque 500s.
|
||
|
|
impl From<ExtractorDispatchError> for ApiError {
|
||
|
|
fn from(e: ExtractorDispatchError) -> Self {
|
||
|
|
match e {
|
||
|
|
ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
|
||
|
|
ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),
|
||
|
|
ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()),
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// `GET /v1/extractors` — catalog of all available verticals.
|
||
|
|
pub async fn list_extractors() -> Json<Value> {
|
||
|
|
Json(json!({
|
||
|
|
"extractors": extractors::list(),
|
||
|
|
}))
|
||
|
|
}
|
||
|
|
|
||
|
|
/// `POST /v1/scrape/{vertical}` — explicit vertical, e.g. /v1/scrape/reddit.
|
||
|
|
pub async fn scrape_vertical(
|
||
|
|
State(state): State<AppState>,
|
||
|
|
Path(vertical): Path<String>,
|
||
|
|
Json(req): Json<ScrapeRequest>,
|
||
|
|
) -> Result<Json<Value>, ApiError> {
|
||
|
|
if req.url.trim().is_empty() {
|
||
|
|
return Err(ApiError::bad_request("`url` is required"));
|
||
|
|
}
|
||
|
|
let data = extractors::dispatch_by_name(state.fetch(), &vertical, &req.url).await?;
|
||
|
|
Ok(Json(json!({
|
||
|
|
"vertical": vertical,
|
||
|
|
"url": req.url,
|
||
|
|
"data": data,
|
||
|
|
})))
|
||
|
|
}
|