mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-25 03:08:06 +02:00
feat(server): add OSS webclaw-server REST API binary (closes #29)
Self-hosters hitting docs/self-hosting were promised three binaries
but the OSS Docker image only shipped two. webclaw-server lived in
the closed-source hosted-platform repo, which couldn't be opened. This
adds a minimal axum REST API in the OSS repo so self-hosting actually
works without pretending to ship the cloud platform.
Crate at crates/webclaw-server/. Stateless, no database, no job queue,
single binary. Endpoints: GET /health, POST /v1/{scrape, crawl, map,
batch, extract, summarize, diff, brand}. JSON shapes mirror
api.webclaw.io for the endpoints OSS can support, so swapping between
self-hosted and hosted is a base-URL change.
Auth: optional bearer token via WEBCLAW_API_KEY / --api-key. Comparison
is constant-time (subtle::ConstantTimeEq). Open mode (no key) is
allowed and binds 127.0.0.1 by default; the Docker image flips
WEBCLAW_HOST=0.0.0.0 so the container is reachable out of the box.
Hard caps to keep naive callers from OOMing the process: crawl capped
at 500 pages synchronously, batch capped at 100 URLs / 20 concurrent.
For unbounded crawls or anti-bot bypass the docs point users at the
hosted API.
Dockerfile + Dockerfile.ci updated to copy webclaw-server into
/usr/local/bin and EXPOSE 3000. Workspace version bumped to 0.4.0
(new public binary).
This commit is contained in:
parent
b4bfff120e
commit
2ba682adf3
20 changed files with 1116 additions and 11 deletions
85
crates/webclaw-server/src/routes/batch.rs
Normal file
85
crates/webclaw-server/src/routes/batch.rs
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
//! POST /v1/batch — fetch + extract many URLs in parallel.
|
||||
//!
|
||||
//! `concurrency` is hard-capped at 20 to avoid hammering targets and
|
||||
//! to bound memory growth for naive callers. For larger batches use
|
||||
//! the hosted API.
|
||||
|
||||
use axum::{Json, extract::State};
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
use webclaw_core::ExtractionOptions;
|
||||
|
||||
use crate::{error::ApiError, state::AppState};
|
||||
|
||||
/// Largest number of URLs a single batch request may contain; longer lists
/// are rejected with a bad-request error.
const HARD_MAX_URLS: usize = 100;

/// Ceiling applied to the caller-supplied `concurrency` value.
const HARD_MAX_CONCURRENCY: usize = 20;
|
||||
|
||||
#[derive(Debug, Deserialize, Default)]
|
||||
#[serde(default)]
|
||||
pub struct BatchRequest {
|
||||
pub urls: Vec<String>,
|
||||
pub concurrency: Option<usize>,
|
||||
pub include_selectors: Vec<String>,
|
||||
pub exclude_selectors: Vec<String>,
|
||||
pub only_main_content: bool,
|
||||
}
|
||||
|
||||
pub async fn batch(
|
||||
State(state): State<AppState>,
|
||||
Json(req): Json<BatchRequest>,
|
||||
) -> Result<Json<Value>, ApiError> {
|
||||
if req.urls.is_empty() {
|
||||
return Err(ApiError::bad_request("`urls` is required"));
|
||||
}
|
||||
if req.urls.len() > HARD_MAX_URLS {
|
||||
return Err(ApiError::bad_request(format!(
|
||||
"too many urls: {} (max {HARD_MAX_URLS})",
|
||||
req.urls.len()
|
||||
)));
|
||||
}
|
||||
|
||||
let concurrency = req.concurrency.unwrap_or(5).clamp(1, HARD_MAX_CONCURRENCY);
|
||||
|
||||
let options = ExtractionOptions {
|
||||
include_selectors: req.include_selectors,
|
||||
exclude_selectors: req.exclude_selectors,
|
||||
only_main_content: req.only_main_content,
|
||||
include_raw_html: false,
|
||||
};
|
||||
|
||||
let url_refs: Vec<&str> = req.urls.iter().map(|s| s.as_str()).collect();
|
||||
let results = state
|
||||
.fetch()
|
||||
.fetch_and_extract_batch_with_options(&url_refs, concurrency, &options)
|
||||
.await;
|
||||
|
||||
let mut ok = 0usize;
|
||||
let mut errors = 0usize;
|
||||
let mut out: Vec<Value> = Vec::with_capacity(results.len());
|
||||
for r in results {
|
||||
match r.result {
|
||||
Ok(extraction) => {
|
||||
ok += 1;
|
||||
out.push(json!({
|
||||
"url": r.url,
|
||||
"metadata": extraction.metadata,
|
||||
"markdown": extraction.content.markdown,
|
||||
}));
|
||||
}
|
||||
Err(e) => {
|
||||
errors += 1;
|
||||
out.push(json!({
|
||||
"url": r.url,
|
||||
"error": e.to_string(),
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Json(json!({
|
||||
"total": out.len(),
|
||||
"completed": ok,
|
||||
"errors": errors,
|
||||
"results": out,
|
||||
})))
|
||||
}
|
||||
32
crates/webclaw-server/src/routes/brand.rs
Normal file
32
crates/webclaw-server/src/routes/brand.rs
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
//! POST /v1/brand — extract brand identity (colors, fonts, logo) from a page.
|
||||
//!
|
||||
//! Pure DOM/CSS analysis — no LLM, no network beyond the page fetch itself.
|
||||
|
||||
use axum::{Json, extract::State};
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
use webclaw_core::brand::extract_brand;
|
||||
|
||||
use crate::{error::ApiError, state::AppState};
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct BrandRequest {
|
||||
pub url: String,
|
||||
}
|
||||
|
||||
pub async fn brand(
|
||||
State(state): State<AppState>,
|
||||
Json(req): Json<BrandRequest>,
|
||||
) -> Result<Json<Value>, ApiError> {
|
||||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
|
||||
let fetched = state.fetch().fetch(&req.url).await?;
|
||||
let brand = extract_brand(&fetched.html, Some(&fetched.url));
|
||||
|
||||
Ok(Json(json!({
|
||||
"url": req.url,
|
||||
"brand": brand,
|
||||
})))
|
||||
}
|
||||
85
crates/webclaw-server/src/routes/crawl.rs
Normal file
85
crates/webclaw-server/src/routes/crawl.rs
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
//! POST /v1/crawl — synchronous BFS crawl.
|
||||
//!
|
||||
//! NOTE: this server is stateless — there is no job queue. Crawls run
|
||||
//! inline and return when complete. `max_pages` is hard-capped at 500
|
||||
//! to avoid OOM on naive callers. For large crawls + async jobs, use
|
||||
//! the hosted API at api.webclaw.io.
|
||||
|
||||
use axum::{Json, extract::State};
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
use std::time::Duration;
|
||||
use webclaw_fetch::{CrawlConfig, Crawler, FetchConfig};
|
||||
|
||||
use crate::{error::ApiError, state::AppState};
|
||||
|
||||
/// Ceiling applied to the caller-supplied `max_pages`; crawls run inline, so
/// this bounds how much a single request can accumulate.
const HARD_MAX_PAGES: usize = 500;
|
||||
|
||||
#[derive(Debug, Deserialize, Default)]
|
||||
#[serde(default)]
|
||||
pub struct CrawlRequest {
|
||||
pub url: String,
|
||||
pub max_depth: Option<usize>,
|
||||
pub max_pages: Option<usize>,
|
||||
pub use_sitemap: bool,
|
||||
pub concurrency: Option<usize>,
|
||||
pub allow_subdomains: bool,
|
||||
pub allow_external_links: bool,
|
||||
pub include_patterns: Vec<String>,
|
||||
pub exclude_patterns: Vec<String>,
|
||||
}
|
||||
|
||||
pub async fn crawl(
|
||||
State(_state): State<AppState>,
|
||||
Json(req): Json<CrawlRequest>,
|
||||
) -> Result<Json<Value>, ApiError> {
|
||||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
let max_pages = req.max_pages.unwrap_or(50).min(HARD_MAX_PAGES);
|
||||
let max_depth = req.max_depth.unwrap_or(3);
|
||||
let concurrency = req.concurrency.unwrap_or(5).min(20);
|
||||
|
||||
let config = CrawlConfig {
|
||||
fetch: FetchConfig::default(),
|
||||
max_depth,
|
||||
max_pages,
|
||||
concurrency,
|
||||
delay: Duration::from_millis(200),
|
||||
path_prefix: None,
|
||||
use_sitemap: req.use_sitemap,
|
||||
include_patterns: req.include_patterns,
|
||||
exclude_patterns: req.exclude_patterns,
|
||||
allow_subdomains: req.allow_subdomains,
|
||||
allow_external_links: req.allow_external_links,
|
||||
progress_tx: None,
|
||||
cancel_flag: None,
|
||||
};
|
||||
|
||||
let crawler = Crawler::new(&req.url, config).map_err(ApiError::from)?;
|
||||
let result = crawler.crawl(&req.url, None).await;
|
||||
|
||||
let pages: Vec<Value> = result
|
||||
.pages
|
||||
.iter()
|
||||
.map(|p| {
|
||||
json!({
|
||||
"url": p.url,
|
||||
"depth": p.depth,
|
||||
"metadata": p.extraction.as_ref().map(|e| &e.metadata),
|
||||
"markdown": p.extraction.as_ref().map(|e| e.content.markdown.as_str()).unwrap_or(""),
|
||||
"error": p.error,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(Json(json!({
|
||||
"url": req.url,
|
||||
"status": "completed",
|
||||
"total": result.total,
|
||||
"completed": result.ok,
|
||||
"errors": result.errors,
|
||||
"elapsed_secs": result.elapsed_secs,
|
||||
"pages": pages,
|
||||
})))
|
||||
}
|
||||
92
crates/webclaw-server/src/routes/diff.rs
Normal file
92
crates/webclaw-server/src/routes/diff.rs
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
//! POST /v1/diff — compare current page content against a prior snapshot.
|
||||
//!
|
||||
//! Caller passes either a full prior `ExtractionResult` or the minimal
|
||||
//! `{ markdown, metadata }` shape used by the hosted API. We re-fetch
|
||||
//! the URL, extract, and run `webclaw_core::diff::diff` over the pair.
|
||||
|
||||
use axum::{Json, extract::State};
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
use webclaw_core::{Content, ExtractionResult, Metadata, diff::diff};
|
||||
|
||||
use crate::{error::ApiError, state::AppState};
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct DiffRequest {
|
||||
pub url: String,
|
||||
pub previous: PreviousSnapshot,
|
||||
}
|
||||
|
||||
/// Either a full prior extraction, or the minimal `{ markdown, metadata }`
|
||||
/// shape returned by /v1/scrape. Untagged so callers can send whichever
|
||||
/// they have on hand.
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum PreviousSnapshot {
|
||||
Full(ExtractionResult),
|
||||
Minimal {
|
||||
#[serde(default)]
|
||||
markdown: String,
|
||||
#[serde(default)]
|
||||
metadata: Option<Metadata>,
|
||||
},
|
||||
}
|
||||
|
||||
impl PreviousSnapshot {
|
||||
fn into_extraction(self) -> ExtractionResult {
|
||||
match self {
|
||||
Self::Full(r) => r,
|
||||
Self::Minimal { markdown, metadata } => ExtractionResult {
|
||||
metadata: metadata.unwrap_or_else(empty_metadata),
|
||||
content: Content {
|
||||
markdown,
|
||||
plain_text: String::new(),
|
||||
links: Vec::new(),
|
||||
images: Vec::new(),
|
||||
code_blocks: Vec::new(),
|
||||
raw_html: None,
|
||||
},
|
||||
domain_data: None,
|
||||
structured_data: Vec::new(),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn empty_metadata() -> Metadata {
|
||||
Metadata {
|
||||
title: None,
|
||||
description: None,
|
||||
author: None,
|
||||
published_date: None,
|
||||
language: None,
|
||||
url: None,
|
||||
site_name: None,
|
||||
image: None,
|
||||
favicon: None,
|
||||
word_count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn diff_route(
|
||||
State(state): State<AppState>,
|
||||
Json(req): Json<DiffRequest>,
|
||||
) -> Result<Json<Value>, ApiError> {
|
||||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
|
||||
let current = state.fetch().fetch_and_extract(&req.url).await?;
|
||||
let previous = req.previous.into_extraction();
|
||||
let result = diff(&previous, ¤t);
|
||||
|
||||
Ok(Json(json!({
|
||||
"url": req.url,
|
||||
"status": result.status,
|
||||
"diff": result.text_diff,
|
||||
"metadata_changes": result.metadata_changes,
|
||||
"links_added": result.links_added,
|
||||
"links_removed": result.links_removed,
|
||||
"word_count_delta": result.word_count_delta,
|
||||
})))
|
||||
}
|
||||
81
crates/webclaw-server/src/routes/extract.rs
Normal file
81
crates/webclaw-server/src/routes/extract.rs
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
//! POST /v1/extract — LLM-powered structured extraction.
|
||||
//!
|
||||
//! Two modes:
|
||||
//! * `schema` — JSON Schema describing what to extract.
|
||||
//! * `prompt` — natural-language instructions.
|
||||
//!
|
||||
//! At least one must be provided. The provider chain is built per
|
||||
//! request from env (Ollama -> OpenAI -> Anthropic). Self-hosters
|
||||
//! get the same fallback behaviour as the CLI.
|
||||
|
||||
use axum::{Json, extract::State};
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
use webclaw_llm::{ProviderChain, extract::extract_json, extract::extract_with_prompt};
|
||||
|
||||
use crate::{error::ApiError, state::AppState};
|
||||
|
||||
#[derive(Debug, Deserialize, Default)]
|
||||
#[serde(default)]
|
||||
pub struct ExtractRequest {
|
||||
pub url: String,
|
||||
pub schema: Option<Value>,
|
||||
pub prompt: Option<String>,
|
||||
/// Optional override of the provider model name (e.g. `gpt-4o-mini`).
|
||||
pub model: Option<String>,
|
||||
}
|
||||
|
||||
pub async fn extract(
|
||||
State(state): State<AppState>,
|
||||
Json(req): Json<ExtractRequest>,
|
||||
) -> Result<Json<Value>, ApiError> {
|
||||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
let has_schema = req.schema.is_some();
|
||||
let has_prompt = req
|
||||
.prompt
|
||||
.as_deref()
|
||||
.map(|p| !p.trim().is_empty())
|
||||
.unwrap_or(false);
|
||||
if !has_schema && !has_prompt {
|
||||
return Err(ApiError::bad_request(
|
||||
"either `schema` or `prompt` is required",
|
||||
));
|
||||
}
|
||||
|
||||
// Fetch + extract first so we feed the LLM clean markdown instead of
|
||||
// raw HTML. Cheaper tokens, better signal.
|
||||
let extraction = state.fetch().fetch_and_extract(&req.url).await?;
|
||||
let content = if extraction.content.markdown.trim().is_empty() {
|
||||
extraction.content.plain_text.clone()
|
||||
} else {
|
||||
extraction.content.markdown.clone()
|
||||
};
|
||||
if content.trim().is_empty() {
|
||||
return Err(ApiError::Extract(
|
||||
"no extractable content on page".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let chain = ProviderChain::default().await;
|
||||
if chain.is_empty() {
|
||||
return Err(ApiError::Llm(
|
||||
"no LLM providers configured (set OLLAMA_HOST, OPENAI_API_KEY, or ANTHROPIC_API_KEY)"
|
||||
.to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let model = req.model.as_deref();
|
||||
let data = if let Some(schema) = req.schema.as_ref() {
|
||||
extract_json(&content, schema, &chain, model).await?
|
||||
} else {
|
||||
let prompt = req.prompt.as_deref().unwrap_or_default();
|
||||
extract_with_prompt(&content, prompt, &chain, model).await?
|
||||
};
|
||||
|
||||
Ok(Json(json!({
|
||||
"url": req.url,
|
||||
"data": data,
|
||||
})))
|
||||
}
|
||||
10
crates/webclaw-server/src/routes/health.rs
Normal file
10
crates/webclaw-server/src/routes/health.rs
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
use axum::Json;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
pub async fn health() -> Json<Value> {
|
||||
Json(json!({
|
||||
"status": "ok",
|
||||
"version": env!("CARGO_PKG_VERSION"),
|
||||
"service": "webclaw-server",
|
||||
}))
|
||||
}
|
||||
49
crates/webclaw-server/src/routes/map.rs
Normal file
49
crates/webclaw-server/src/routes/map.rs
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
//! POST /v1/map — discover URLs from a site's sitemaps.
|
||||
//!
|
||||
//! Walks robots.txt + common sitemap paths, recursively resolves
|
||||
//! `<sitemapindex>` files, and returns the deduplicated list of URLs.
|
||||
|
||||
use axum::{Json, extract::State};
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
use webclaw_fetch::sitemap;
|
||||
|
||||
use crate::{error::ApiError, state::AppState};
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct MapRequest {
|
||||
pub url: String,
|
||||
/// When true, return the full SitemapEntry objects (with lastmod,
|
||||
/// priority, changefreq). Defaults to false → bare URL strings,
|
||||
/// matching the hosted-API shape.
|
||||
#[serde(default)]
|
||||
pub include_metadata: bool,
|
||||
}
|
||||
|
||||
pub async fn map(
|
||||
State(state): State<AppState>,
|
||||
Json(req): Json<MapRequest>,
|
||||
) -> Result<Json<Value>, ApiError> {
|
||||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
|
||||
let entries = sitemap::discover(state.fetch(), &req.url).await?;
|
||||
|
||||
let body = if req.include_metadata {
|
||||
json!({
|
||||
"url": req.url,
|
||||
"count": entries.len(),
|
||||
"urls": entries,
|
||||
})
|
||||
} else {
|
||||
let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
|
||||
json!({
|
||||
"url": req.url,
|
||||
"count": urls.len(),
|
||||
"urls": urls,
|
||||
})
|
||||
};
|
||||
|
||||
Ok(Json(body))
|
||||
}
|
||||
18
crates/webclaw-server/src/routes/mod.rs
Normal file
18
crates/webclaw-server/src/routes/mod.rs
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
//! HTTP route handlers.
|
||||
//!
|
||||
//! The OSS server exposes a deliberately small surface that mirrors the
|
||||
//! hosted-API JSON shapes where the underlying capability exists in the
|
||||
//! OSS crates. Endpoints that depend on private infrastructure
|
||||
//! (anti-bot bypass with stealth Chrome, JS rendering at scale,
|
||||
//! per-user auth, billing, async job queues, agent loops) are
|
||||
//! intentionally not implemented here. Use api.webclaw.io for those.
|
||||
|
||||
pub mod batch;
|
||||
pub mod brand;
|
||||
pub mod crawl;
|
||||
pub mod diff;
|
||||
pub mod extract;
|
||||
pub mod health;
|
||||
pub mod map;
|
||||
pub mod scrape;
|
||||
pub mod summarize;
|
||||
108
crates/webclaw-server/src/routes/scrape.rs
Normal file
108
crates/webclaw-server/src/routes/scrape.rs
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
//! POST /v1/scrape — fetch a URL, run extraction, return the requested
|
||||
//! formats. JSON shape mirrors the hosted-API response where possible so
|
||||
//! migrating from self-hosted → cloud is a config change, not a code one.
|
||||
|
||||
use axum::{Json, extract::State};
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
use webclaw_core::{ExtractionOptions, llm::to_llm_text};
|
||||
|
||||
use crate::{error::ApiError, state::AppState};
|
||||
|
||||
#[derive(Debug, Deserialize, Default)]
|
||||
#[serde(default)]
|
||||
pub struct ScrapeRequest {
|
||||
pub url: String,
|
||||
/// Output formats. Allowed: "markdown", "text", "llm", "json", "html".
|
||||
/// Defaults to ["markdown"]. Accepts a single string ("format")
|
||||
/// or an array ("formats") for hosted-API compatibility.
|
||||
#[serde(alias = "format")]
|
||||
pub formats: ScrapeFormats,
|
||||
pub include_selectors: Vec<String>,
|
||||
pub exclude_selectors: Vec<String>,
|
||||
pub only_main_content: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum ScrapeFormats {
|
||||
One(String),
|
||||
Many(Vec<String>),
|
||||
}
|
||||
|
||||
impl Default for ScrapeFormats {
|
||||
fn default() -> Self {
|
||||
Self::Many(vec!["markdown".into()])
|
||||
}
|
||||
}
|
||||
|
||||
impl ScrapeFormats {
|
||||
fn as_vec(&self) -> Vec<String> {
|
||||
match self {
|
||||
Self::One(s) => vec![s.clone()],
|
||||
Self::Many(v) => v.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn scrape(
|
||||
State(state): State<AppState>,
|
||||
Json(req): Json<ScrapeRequest>,
|
||||
) -> Result<Json<Value>, ApiError> {
|
||||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
let formats = req.formats.as_vec();
|
||||
|
||||
let options = ExtractionOptions {
|
||||
include_selectors: req.include_selectors,
|
||||
exclude_selectors: req.exclude_selectors,
|
||||
only_main_content: req.only_main_content,
|
||||
include_raw_html: formats.iter().any(|f| f == "html"),
|
||||
};
|
||||
|
||||
let extraction = state
|
||||
.fetch()
|
||||
.fetch_and_extract_with_options(&req.url, &options)
|
||||
.await?;
|
||||
|
||||
let mut body = json!({
|
||||
"url": extraction.metadata.url.clone().unwrap_or_else(|| req.url.clone()),
|
||||
"metadata": extraction.metadata,
|
||||
});
|
||||
let obj = body.as_object_mut().expect("json::object");
|
||||
|
||||
for f in &formats {
|
||||
match f.as_str() {
|
||||
"markdown" => {
|
||||
obj.insert("markdown".into(), json!(extraction.content.markdown));
|
||||
}
|
||||
"text" => {
|
||||
obj.insert("text".into(), json!(extraction.content.plain_text));
|
||||
}
|
||||
"llm" => {
|
||||
let llm = to_llm_text(&extraction, extraction.metadata.url.as_deref());
|
||||
obj.insert("llm".into(), json!(llm));
|
||||
}
|
||||
"html" => {
|
||||
if let Some(raw) = &extraction.content.raw_html {
|
||||
obj.insert("html".into(), json!(raw));
|
||||
}
|
||||
}
|
||||
"json" => {
|
||||
obj.insert("json".into(), json!(extraction));
|
||||
}
|
||||
other => {
|
||||
return Err(ApiError::bad_request(format!(
|
||||
"unknown format: '{other}' (allowed: markdown, text, llm, html, json)"
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !extraction.structured_data.is_empty() {
|
||||
obj.insert("structured_data".into(), json!(extraction.structured_data));
|
||||
}
|
||||
|
||||
Ok(Json(body))
|
||||
}
|
||||
52
crates/webclaw-server/src/routes/summarize.rs
Normal file
52
crates/webclaw-server/src/routes/summarize.rs
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
//! POST /v1/summarize — LLM-powered page summary.
|
||||
|
||||
use axum::{Json, extract::State};
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
use webclaw_llm::{ProviderChain, summarize::summarize};
|
||||
|
||||
use crate::{error::ApiError, state::AppState};
|
||||
|
||||
#[derive(Debug, Deserialize, Default)]
|
||||
#[serde(default)]
|
||||
pub struct SummarizeRequest {
|
||||
pub url: String,
|
||||
pub max_sentences: Option<usize>,
|
||||
pub model: Option<String>,
|
||||
}
|
||||
|
||||
pub async fn summarize_route(
|
||||
State(state): State<AppState>,
|
||||
Json(req): Json<SummarizeRequest>,
|
||||
) -> Result<Json<Value>, ApiError> {
|
||||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
|
||||
let extraction = state.fetch().fetch_and_extract(&req.url).await?;
|
||||
let content = if extraction.content.markdown.trim().is_empty() {
|
||||
extraction.content.plain_text.clone()
|
||||
} else {
|
||||
extraction.content.markdown.clone()
|
||||
};
|
||||
if content.trim().is_empty() {
|
||||
return Err(ApiError::Extract(
|
||||
"no extractable content on page".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let chain = ProviderChain::default().await;
|
||||
if chain.is_empty() {
|
||||
return Err(ApiError::Llm(
|
||||
"no LLM providers configured (set OLLAMA_HOST, OPENAI_API_KEY, or ANTHROPIC_API_KEY)"
|
||||
.to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let summary = summarize(&content, req.max_sentences, &chain, req.model.as_deref()).await?;
|
||||
|
||||
Ok(Json(json!({
|
||||
"url": req.url,
|
||||
"summary": summary,
|
||||
})))
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue