From de899ab3ba1cf11ca126572590e1d279c9bd3287 Mon Sep 17 00:00:00 2001 From: webclaw Date: Sat, 6 Jun 2026 14:20:03 +0200 Subject: [PATCH] feat(search): standalone web search with your own Serper.dev key OSS surfaces can now search without the hosted webclaw API. New webclaw-fetch::search() calls Serper.dev directly with a user-supplied key and optionally fetches + extracts the result pages. Wired into the CLI (webclaw search, --serper-key / SERPER_API_KEY), the MCP search tool (local-first when SERPER_API_KEY is set, cloud fallback otherwise), and the OSS reference server (POST /v1/search). Adds futures for concurrent result page scraping. --- Cargo.lock | 1 + crates/webclaw-cli/src/cli.rs | 37 +++ crates/webclaw-cli/src/main.rs | 36 ++- crates/webclaw-cli/src/run.rs | 67 +++++ crates/webclaw-fetch/Cargo.toml | 3 + crates/webclaw-fetch/src/lib.rs | 2 + crates/webclaw-fetch/src/search.rs | 322 +++++++++++++++++++++ crates/webclaw-mcp/src/server.rs | 52 +++- crates/webclaw-mcp/src/tools.rs | 11 +- crates/webclaw-server/src/error.rs | 10 + crates/webclaw-server/src/main.rs | 45 +++ crates/webclaw-server/src/routes/mod.rs | 6 + crates/webclaw-server/src/routes/search.rs | 68 +++++ crates/webclaw-server/src/state.rs | 18 ++ 14 files changed, 671 insertions(+), 7 deletions(-) create mode 100644 crates/webclaw-fetch/src/search.rs create mode 100644 crates/webclaw-server/src/routes/search.rs diff --git a/Cargo.lock b/Cargo.lock index 4acefe2..9ad7389 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3264,6 +3264,7 @@ dependencies = [ "bytes", "calamine", "flate2", + "futures", "http", "quick-xml 0.37.5", "rand 0.8.5", diff --git a/crates/webclaw-cli/src/cli.rs b/crates/webclaw-cli/src/cli.rs index 1221cdf..0ce5204 100644 --- a/crates/webclaw-cli/src/cli.rs +++ b/crates/webclaw-cli/src/cli.rs @@ -271,6 +271,43 @@ pub enum Commands { #[arg(long)] raw: bool, }, + + /// Web search via Serper.dev using YOUR OWN API key. 
+ /// + /// Returns Google organic results (title, link, snippet). With + /// `--scrape`, each result page is fetched and extracted to markdown. + /// Get a free key at serper.dev, then pass `--serper-key` or set + /// `SERPER_API_KEY`. + /// + /// Example: `webclaw search "rust async runtime" --num 5 --scrape`. + Search { + /// Search query. + query: String, + + /// Serper.dev API key. Falls back to the `SERPER_API_KEY` env var. + #[arg(long, env = "SERPER_API_KEY")] + serper_key: Option, + + /// Number of results to return (1-10). + #[arg(long, default_value = "5")] + num: usize, + + /// Country code for localization (e.g. "us", "gb", "it"). + #[arg(long)] + country: Option, + + /// Language code for localization (e.g. "en", "it"). + #[arg(long)] + lang: Option, + + /// Fetch + extract each result page and include its markdown. + #[arg(long)] + scrape: bool, + + /// Output format: `markdown` (human-readable, default) or `json`. + #[arg(short, long, default_value = "markdown")] + format: OutputFormat, + }, } #[derive(Clone, ValueEnum)] diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 1a834e4..c93764d 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -21,7 +21,7 @@ use fetch::{ use output::{format_output, print_cloud_output, print_output}; use run::{ has_llm_flags, run_batch, run_batch_llm, run_brand, run_crawl, run_diff, run_llm, run_map, - run_research, run_watch, + run_research, run_search, run_watch, }; fn init_logging(verbose: bool) { @@ -145,6 +145,40 @@ async fn main() { } return; } + Commands::Search { + query, + serper_key, + num, + country, + lang, + scrape, + format, + } => { + let key = match serper_key { + Some(k) if !k.trim().is_empty() => k.clone(), + _ => { + eprintln!( + "error: search requires a Serper.dev API key: pass --serper-key or set SERPER_API_KEY (get one free at serper.dev)" + ); + process::exit(1); + } + }; + if let Err(e) = run_search( + &key, + query, + *num, + 
country.as_deref(), + lang.as_deref(), + *scrape, + format, + ) + .await + { + eprintln!("error: {e}"); + process::exit(1); + } + return; + } } } diff --git a/crates/webclaw-cli/src/run.rs b/crates/webclaw-cli/src/run.rs index e5a0bf3..7657e5a 100644 --- a/crates/webclaw-cli/src/run.rs +++ b/crates/webclaw-cli/src/run.rs @@ -229,6 +229,73 @@ pub async fn run_map(cli: &Cli) -> Result<(), String> { Ok(()) } +/// Web search via Serper.dev with the caller's own API key. +/// +/// The Serper key is resolved by the caller (flag or `SERPER_API_KEY` +/// env, via clap's `env`) and passed in already-unwrapped. When `scrape` +/// is set, each result page is fetched + extracted through a FetchClient +/// (which carries the browser TLS profile) and its markdown is included. +#[allow(clippy::too_many_arguments)] +pub async fn run_search( + serper_key: &str, + query: &str, + num: usize, + country: Option<&str>, + lang: Option<&str>, + scrape: bool, + format: &OutputFormat, +) -> Result<(), String> { + // Default fetch config is enough: search localization is handled by + // Serper's gl/hl, and the result-page scrape just needs a standard + // browser profile. Attach cloud fallback when WEBCLAW_API_KEY is set + // so scraped pages behind bot protection can still escalate. 
+ let mut client = FetchClient::new(webclaw_fetch::FetchConfig::default()) + .map_err(|e| format!("client error: {e}"))?; + if let Some(cloud) = webclaw_fetch::cloud::CloudClient::from_env() { + client = client.with_cloud(cloud); + } + + let opts = webclaw_fetch::SearchOptions { + num_results: num, + country: country.map(str::to_string), + lang: lang.map(str::to_string), + scrape, + }; + + let results = webclaw_fetch::search(&client, serper_key, query, &opts) + .await + .map_err(|e| format!("search error: {e}"))?; + + if matches!(format, OutputFormat::Json) { + let json = serde_json::json!({ "query": query, "results": results }); + match serde_json::to_string_pretty(&json) { + Ok(s) => println!("{s}"), + Err(e) => return Err(format!("JSON encode failed: {e}")), + } + return Ok(()); + } + + if results.is_empty() { + eprintln!("no results for \"{query}\""); + return Ok(()); + } + + for r in &results { + println!("{}. {}", r.position, r.title); + println!(" {}", r.link); + if !r.snippet.is_empty() { + println!(" {}", r.snippet); + } + if let Some(ref content) = r.content { + println!(); + println!("{content}"); + } + println!(); + } + + Ok(()) +} + pub async fn run_batch(cli: &Cli, entries: &[(String, Option)]) -> Result<(), String> { let client = Arc::new( FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?, diff --git a/crates/webclaw-fetch/Cargo.toml b/crates/webclaw-fetch/Cargo.toml index dc2011f..abbf6f7 100644 --- a/crates/webclaw-fetch/Cargo.toml +++ b/crates/webclaw-fetch/Cargo.toml @@ -30,6 +30,9 @@ serde_json.workspace = true calamine = "0.34" zip = "2" flate2 = "1" +# Already in the dependency tree (transitive); used directly here for +# `join_all` to drive bounded-concurrent result-page scrapes in search. 
+futures = { version = "0.3", default-features = false } [dev-dependencies] tempfile = "3" diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs index 9fb702a..3f86ae3 100644 --- a/crates/webclaw-fetch/src/lib.rs +++ b/crates/webclaw-fetch/src/lib.rs @@ -14,6 +14,7 @@ pub mod locale; pub mod map; pub mod proxy; pub mod reddit; +pub mod search; pub mod sitemap; pub mod tls; pub mod url_security; @@ -27,5 +28,6 @@ pub use http::HeaderMap; pub use locale::{accept_language_for_tld, accept_language_for_url}; pub use map::{MapOptions, discover_urls}; pub use proxy::{parse_proxy_file, parse_proxy_line}; +pub use search::{SearchOptions, SearchResult, parse_serper_organic, search}; pub use sitemap::SitemapEntry; pub use webclaw_pdf::PdfMode; diff --git a/crates/webclaw-fetch/src/search.rs b/crates/webclaw-fetch/src/search.rs new file mode 100644 index 0000000..1a03592 --- /dev/null +++ b/crates/webclaw-fetch/src/search.rs @@ -0,0 +1,322 @@ +//! Web search via Serper.dev (Google results) with optional content scraping. +//! +//! This is the self-hosted search path: the caller supplies their own +//! Serper.dev API key (free tier at serper.dev). The CLI, MCP server, and +//! OSS REST server all route through [`search`] so search works without the +//! hosted webclaw API. +//! +//! Serper returns a plain JSON API, so we hit it with a vanilla wreq client +//! (10s timeout) — no browser TLS fingerprinting needed. When `scrape` is +//! set, the top results are fetched through the caller's [`FetchClient`] +//! (which *does* carry the fingerprinting) and extracted to markdown. +use std::sync::Arc; +use std::time::Duration; + +use serde::{Deserialize, Serialize}; +use serde_json::{Value, json}; +use tokio::sync::Semaphore; +use tracing::warn; + +use crate::client::FetchClient; +use crate::error::FetchError; + +/// Serper.dev search endpoint. 
+const SERPER_URL: &str = "https://google.serper.dev/search"; + +/// Bound on the number of result pages scraped concurrently when +/// `scrape` is enabled. Keeps the fan-out from overwhelming the proxy +/// pool / remote hosts on a large result set. +const SCRAPE_CONCURRENCY: usize = 5; + +/// Options controlling a search request. +#[derive(Debug, Clone)] +pub struct SearchOptions { + /// Number of organic results to request (clamped to `1..=10`). + pub num_results: usize, + /// Country code for localization (Serper `gl`, e.g. `"us"`, `"gb"`). + pub country: Option<String>, + /// Language code for localization (Serper `hl`, e.g. `"en"`, `"it"`). + pub lang: Option<String>, + /// When true, fetch + extract the result pages and fill in `content`. + pub scrape: bool, +} + +impl Default for SearchOptions { + fn default() -> Self { + Self { + num_results: 5, + country: None, + lang: None, + scrape: false, + } + } +} + +/// A single organic search result. When `scrape` was requested and the +/// fetch succeeded, `content` holds the extracted markdown; otherwise it +/// is `None` (a per-result fetch failure never fails the whole search). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SearchResult { + pub title: String, + pub link: String, + pub snippet: String, + pub position: usize, + #[serde(skip_serializing_if = "Option::is_none")] + pub content: Option<String>, +} + +/// Run a web search through Serper.dev. +/// +/// `client` — the caller's [`FetchClient`], used only when `opts.scrape` +/// is set (to fetch + extract the result pages). +/// `serper_key` — the caller's Serper.dev API key. +/// `query` — the search query. +/// `opts` — result count, localization, and whether to scrape. +/// +/// Returns the organic results in Serper's order. With `scrape` enabled, +/// the top results are fetched concurrently (bounded) and their extracted +/// markdown is attached to `content`.
+pub async fn search( + client: &FetchClient, + serper_key: &str, + query: &str, + opts: &SearchOptions, +) -> Result<Vec<SearchResult>, FetchError> { + let num = opts.num_results.clamp(1, 10); + + let response = call_serper( + serper_key, + query, + num, + opts.country.as_deref(), + opts.lang.as_deref(), + ) + .await?; + + let mut results = parse_serper_organic(&response); + + if opts.scrape && !results.is_empty() { + scrape_results(client, &mut results).await; + } + + Ok(results) +} + +/// POST the query to Serper.dev and return the raw JSON response. +/// +/// Builds a plain wreq client (no browser emulation — Serper is a JSON +/// API, not a bot-protected page). Non-2xx responses are surfaced as a +/// [`FetchError::Build`] carrying the status and body so the caller can +/// show Serper's own error (bad key, quota exceeded, etc.). +async fn call_serper( + api_key: &str, + query: &str, + num: usize, + country: Option<&str>, + lang: Option<&str>, +) -> Result<Value, FetchError> { + let http = wreq::Client::builder() + .timeout(Duration::from_secs(10)) + .build() + .map_err(|e| FetchError::Build(format!("failed to build serper client: {e}")))?; + + let mut body = json!({ "q": query, "num": num }); + if let Some(gl) = country { + body["gl"] = json!(gl); + } + if let Some(hl) = lang { + body["hl"] = json!(hl); + } + // Serialize ourselves rather than `.json()` — the wreq `json` feature + // is not enabled in this crate and isn't worth pulling in for one call.
+ let payload = serde_json::to_vec(&body) + .map_err(|e| FetchError::Build(format!("serper request encode error: {e}")))?; + + let resp = http + .post(SERPER_URL) + .header("X-API-KEY", api_key) + .header("Content-Type", "application/json") + .body(payload) + .send() + .await?; + + let status = resp.status(); + if !status.is_success() { + let code = status.as_u16(); + let text = resp.text().await.unwrap_or_default(); + return Err(FetchError::Build(format!("serper returned {code}: {text}"))); + } + + let text = resp + .text() + .await + .map_err(|e| FetchError::BodyDecode(format!("serper response read error: {e}")))?; + serde_json::from_str::<Value>(&text) + .map_err(|e| FetchError::BodyDecode(format!("serper response parse error: {e}"))) +} + +/// Parse the `organic` array of a Serper response into [`SearchResult`]s. +/// +/// Pure (no network), so it is unit-tested against a fixture. Entries +/// missing `title` or `link` are skipped; `snippet` defaults to empty. +/// `position` is 1-based over the kept entries. +pub fn parse_serper_organic(response: &Value) -> Vec<SearchResult> { + let Some(organic) = response.get("organic").and_then(|v| v.as_array()) else { + return Vec::new(); + }; + + organic + .iter() + .filter_map(|item| { + let title = item.get("title")?.as_str()?.to_string(); + let link = item.get("link")?.as_str()?.to_string(); + let snippet = item + .get("snippet") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + Some(SearchResult { + title, + link, + snippet, + // Filled in after collection so it tracks kept entries, + // not the raw array index (which may include skips). + position: 0, + content: None, + }) + }) + .enumerate() + .map(|(i, mut r)| { + r.position = i + 1; + r + }) + .collect() +} + +/// Fetch + extract the result pages and attach markdown to `content`. +/// +/// Bounded by [`SCRAPE_CONCURRENCY`]. A per-result fetch or extraction +/// failure leaves that result's `content` as `None` rather than failing +/// the whole search.
+async fn scrape_results(client: &FetchClient, results: &mut [SearchResult]) { + let sem = Arc::new(Semaphore::new(SCRAPE_CONCURRENCY)); + + // Collect owned links first so the per-result futures don't borrow + // `results`. That keeps the future captures free of the slice's + // lifetime, which is what lets this compile inside the MCP `#[tool]` + // macro's stricter `Send`/lifetime bounds. + let links: Vec<String> = results.iter().map(|r| r.link.clone()).collect(); + + let scrapes = links.into_iter().map(|link| { + let sem = sem.clone(); + async move { + // If the semaphore is closed (shutdown race), skip rather than panic. + let _permit = match sem.acquire().await { + Ok(p) => p, + Err(_) => return None, + }; + match client.fetch(&link).await { + Ok(fetched) => match webclaw_core::extract(&fetched.html, Some(&fetched.url)) { + Ok(extraction) => Some(extraction.content.markdown), + Err(e) => { + warn!(url = %link, error = %e, "search: extraction failed"); + None + } + }, + Err(e) => { + warn!(url = %link, error = %e, "search: fetch failed"); + None + } + } + } + }); + + // `join_all` drives every scrape future concurrently and returns + // results in input order; the semaphore caps how many fetches run at + // once. Result set is tiny (≤10), so the all-at-once poll is fine.
+ let contents = futures::future::join_all(scrapes).await; + for (r, content) in results.iter_mut().zip(contents) { + r.content = content; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn fixture() -> Value { + json!({ + "searchParameters": { "q": "rust async", "type": "search" }, + "organic": [ + { + "title": "Async Rust", + "link": "https://example.com/async", + "snippet": "Learn async in Rust.", + "position": 1 + }, + { + // snippet missing on purpose -> defaults to "" + "title": "Tokio", + "link": "https://tokio.rs" + }, + { + // no link -> skipped, must not shift positions of the rest + "title": "No Link Here" + } + ] + }) + } + + #[test] + fn parses_organic_results() { + let results = parse_serper_organic(&fixture()); + assert_eq!(results.len(), 2); + + assert_eq!(results[0].title, "Async Rust"); + assert_eq!(results[0].link, "https://example.com/async"); + assert_eq!(results[0].snippet, "Learn async in Rust."); + assert_eq!(results[0].position, 1); + assert!(results[0].content.is_none()); + + // Missing snippet -> empty string, and position is 1-based over + // kept entries (the link-less entry is dropped, not counted). 
+ assert_eq!(results[1].title, "Tokio"); + assert_eq!(results[1].snippet, ""); + assert_eq!(results[1].position, 2); + } + + #[test] + fn missing_organic_key_yields_empty() { + assert!(parse_serper_organic(&json!({})).is_empty()); + assert!(parse_serper_organic(&json!({ "organic": "not-an-array" })).is_empty()); + } + + #[test] + fn search_result_serializes_without_null_content() { + let r = SearchResult { + title: "T".into(), + link: "https://e.com".into(), + snippet: "s".into(), + position: 1, + content: None, + }; + let v = serde_json::to_value(&r).unwrap(); + assert!(v.get("content").is_none(), "None content should be skipped"); + + let r2 = SearchResult { + content: Some("# md".into()), + ..r + }; + let v2 = serde_json::to_value(&r2).unwrap(); + assert_eq!(v2.get("content").and_then(|c| c.as_str()), Some("# md")); + } + + #[test] + fn default_options() { + let o = SearchOptions::default(); + assert_eq!(o.num_results, 5); + assert!(!o.scrape); + assert!(o.country.is_none()); + assert!(o.lang.is_none()); + } +} diff --git a/crates/webclaw-mcp/src/server.rs b/crates/webclaw-mcp/src/server.rs index 9a469aa..ed76920 100644 --- a/crates/webclaw-mcp/src/server.rs +++ b/crates/webclaw-mcp/src/server.rs @@ -650,13 +650,55 @@ impl WebclawMcp { )) } - /// Search the web for a query and return structured results. Requires WEBCLAW_API_KEY. + /// Search the web for a query and return structured results. + /// + /// Resolves the backend in priority order: + /// 1. `SERPER_API_KEY` set → local Serper.dev search with the user's + /// own key (no hosted API needed). Supports `country`, `lang`, and + /// `scrape` (fetch + extract each result page). + /// 2. else `WEBCLAW_API_KEY` set → the hosted webclaw search API. + /// 3. else → an error explaining both options. #[tool] async fn search(&self, Parameters(params): Parameters) -> Result { - let cloud = self - .cloud - .as_ref() - .ok_or("Search requires WEBCLAW_API_KEY. 
Get a key at https://webclaw.io")?; + // Local path: user's own Serper key. Preferred when present so the + // tool works without the hosted API and without spending credits. + if let Ok(serper_key) = std::env::var("SERPER_API_KEY") { + if !serper_key.trim().is_empty() { + let opts = webclaw_fetch::SearchOptions { + num_results: params.num_results.unwrap_or(5) as usize, + country: params.country.clone(), + lang: params.lang.clone(), + scrape: params.scrape.unwrap_or(false), + }; + let results = webclaw_fetch::search( + self.fetch_client.as_ref(), + &serper_key, + &params.query, + &opts, + ) + .await + .map_err(|e| format!("search error: {e}"))?; + + let mut output = format!("Found {} results:\n\n", results.len()); + for r in &results { + output.push_str(&format!("{}. {}\n {}\n", r.position, r.title, r.link)); + if !r.snippet.is_empty() { + output.push_str(&format!(" {}\n", r.snippet)); + } + if let Some(ref content) = r.content { + output.push_str(&format!("\n{content}\n")); + } + output.push('\n'); + } + return Ok(output); + } + } + + // Hosted path: the webclaw cloud API. + let cloud = self.cloud.as_ref().ok_or( + "Search requires a search backend: set SERPER_API_KEY for local search \ + (get one free at serper.dev), or WEBCLAW_API_KEY for the hosted API.", + )?; let mut body = json!({ "query": params.query }); if let Some(num) = params.num_results { diff --git a/crates/webclaw-mcp/src/tools.rs b/crates/webclaw-mcp/src/tools.rs index 02bf534..a1d9446 100644 --- a/crates/webclaw-mcp/src/tools.rs +++ b/crates/webclaw-mcp/src/tools.rs @@ -100,8 +100,17 @@ pub struct ResearchParams { pub struct SearchParams { /// Search query pub query: String, - /// Number of results to return (default: 10) + /// Number of results to return (default: 5, max: 10) pub num_results: Option, + /// Country code for localization (e.g. "us", "gb", "it"). + /// Only used by the local Serper path (SERPER_API_KEY). + pub country: Option, + /// Language code for localization (e.g. "en", "it").
+ /// Only used by the local Serper path (SERPER_API_KEY). + pub lang: Option, + /// When true, fetch + extract each result page and include its + /// markdown. Only used by the local Serper path (SERPER_API_KEY). + pub scrape: Option, } /// Parameters for `vertical_scrape`: run a site-specific extractor by name. diff --git a/crates/webclaw-server/src/error.rs b/crates/webclaw-server/src/error.rs index a63848f..95c858e 100644 --- a/crates/webclaw-server/src/error.rs +++ b/crates/webclaw-server/src/error.rs @@ -38,6 +38,9 @@ pub enum ApiError { #[error("internal: {0}")] Internal(String), + + #[error("{0}")] + NotImplemented(String), } impl ApiError { @@ -48,6 +51,12 @@ impl ApiError { pub fn internal(msg: impl Into) -> Self { Self::Internal(msg.into()) } + /// 501 — a capability the operator hasn't configured (e.g. search + /// without `SERPER_API_KEY`). Distinct from `BadRequest` (client's + /// fault) and `Internal` (our fault): it's a deployment-config gap. + pub fn not_implemented(msg: impl Into) -> Self { + Self::NotImplemented(msg.into()) + } fn status(&self) -> StatusCode { match self { @@ -57,6 +66,7 @@ impl ApiError { Self::Fetch(_) => StatusCode::BAD_GATEWAY, Self::Extract(_) | Self::Llm(_) => StatusCode::UNPROCESSABLE_ENTITY, Self::Internal(_) => StatusCode::INTERNAL_SERVER_ERROR, + Self::NotImplemented(_) => StatusCode::NOT_IMPLEMENTED, } } } diff --git a/crates/webclaw-server/src/main.rs b/crates/webclaw-server/src/main.rs index 0053db5..8da3764 100644 --- a/crates/webclaw-server/src/main.rs +++ b/crates/webclaw-server/src/main.rs @@ -123,6 +123,7 @@ fn build_app(state: AppState) -> Router { ) .route("/crawl", post(routes::crawl::crawl)) .route("/map", post(routes::map::map)) + .route("/search", post(routes::search::search)) .route("/batch", post(routes::batch::batch)) .route("/extract", post(routes::extract::extract)) .route("/extractors", get(routes::structured::list_extractors)) @@ -289,4 +290,48 @@ mod tests { "expected unknown-format error, got 
{body:?}" ); } + + fn post_json(uri: &str, body: &str) -> Request { + Request::builder() + .method("POST") + .uri(uri) + .header("content-type", "application/json") + .body(Body::from(body.to_owned())) + .expect("request") + } + + #[tokio::test] + async fn search_empty_query_is_bad_request() { + // The empty-query guard runs before the key check, so this is + // hermetic regardless of whether SERPER_API_KEY is set. + let app = app_with_key(None).await; + let resp = app + .oneshot(post_json("/v1/search", r#"{"query":" "}"#)) + .await + .expect("response"); + assert_eq!(resp.status(), StatusCode::BAD_REQUEST); + } + + #[tokio::test] + async fn search_without_serper_key_is_not_implemented() { + // Only meaningful when the operator hasn't configured a key. + // Skip if the test environment happens to set SERPER_API_KEY so + // we don't make a live Serper call from the test suite. + if std::env::var("SERPER_API_KEY").is_ok_and(|k| !k.trim().is_empty()) { + return; + } + let app = app_with_key(None).await; + let resp = app + .oneshot(post_json("/v1/search", r#"{"query":"rust"}"#)) + .await + .expect("response"); + assert_eq!(resp.status(), StatusCode::NOT_IMPLEMENTED); + let body = json_body(resp).await; + assert!( + body["error"] + .as_str() + .is_some_and(|e| e.contains("SERPER_API_KEY")), + "expected serper setup hint, got {body:?}" + ); + } } diff --git a/crates/webclaw-server/src/routes/mod.rs b/crates/webclaw-server/src/routes/mod.rs index 01f1052..3ed2273 100644 --- a/crates/webclaw-server/src/routes/mod.rs +++ b/crates/webclaw-server/src/routes/mod.rs @@ -6,6 +6,11 @@ //! (anti-bot bypass with stealth Chrome, JS rendering at scale, //! per-user auth, billing, async job queues, agent loops) are //! intentionally not implemented here. Use api.webclaw.io for those. +//! +//! `POST /v1/search` is supported when the operator supplies their own +//! Serper.dev API key via the `SERPER_API_KEY` env var (free key at +//! serper.dev). Without it, the route returns 501. 
This is the +//! bring-your-own-key path — no hosted webclaw account required. pub mod batch; pub mod brand; @@ -15,5 +20,6 @@ pub mod extract; pub mod health; pub mod map; pub mod scrape; +pub mod search; pub mod structured; pub mod summarize; diff --git a/crates/webclaw-server/src/routes/search.rs b/crates/webclaw-server/src/routes/search.rs new file mode 100644 index 0000000..5bc480e --- /dev/null +++ b/crates/webclaw-server/src/routes/search.rs @@ -0,0 +1,68 @@ +//! POST /v1/search — web search via Serper.dev using the operator's own key. +//! +//! Enabled only when the server is started with `SERPER_API_KEY` set +//! (get a free key at serper.dev). Without it, this route returns 501 so +//! self-hosters know the capability exists but isn't configured. +//! +//! With `scrape: true`, each result page is fetched + extracted to +//! markdown via the shared [`webclaw_fetch::FetchClient`]. A per-result +//! fetch failure leaves that result's `content` null; it never fails the +//! whole search. + +use axum::{Json, extract::State}; +use serde::Deserialize; +use serde_json::{Value, json}; + +use crate::{error::ApiError, state::AppState}; + +#[derive(Debug, Deserialize)] +pub struct SearchRequest { + pub query: String, + /// Max results to return (default 5, clamped to 1..=10). + #[serde(default = "default_num_results")] + pub num_results: usize, + /// Country code for localization (e.g. "us", "gb", "it"). + pub country: Option, + /// Language code for localization (e.g. "en", "it"). + pub lang: Option, + /// When true, fetch + extract each result page and include its markdown. 
+ #[serde(default)] + pub scrape: bool, +} + +fn default_num_results() -> usize { + 5 +} + +pub async fn search( + State(state): State, + Json(req): Json, +) -> Result, ApiError> { + if req.query.trim().is_empty() { + return Err(ApiError::bad_request("`query` is required")); + } + + let serper_key = state.serper_api_key().ok_or_else(|| { + ApiError::not_implemented( + "search is not configured: start the server with SERPER_API_KEY set \ + (get a free key at serper.dev)", + ) + })?; + + let opts = webclaw_fetch::SearchOptions { + num_results: req.num_results, + country: req.country.clone(), + lang: req.lang.clone(), + scrape: req.scrape, + }; + + let results = webclaw_fetch::search(state.fetch(), serper_key, &req.query, &opts) + .await + .map_err(|e| ApiError::internal(format!("search failed: {e}")))?; + + Ok(Json(json!({ + "query": req.query, + "count": results.len(), + "results": results, + }))) +} diff --git a/crates/webclaw-server/src/state.rs b/crates/webclaw-server/src/state.rs index 9807a04..afa304c 100644 --- a/crates/webclaw-server/src/state.rs +++ b/crates/webclaw-server/src/state.rs @@ -47,6 +47,9 @@ struct Inner { pub llm_chain: Arc, /// Inbound bearer-auth token for this server's own `/v1/*` surface. pub api_key: Option, + /// Operator's own Serper.dev API key, read from `SERPER_API_KEY`. + /// Enables `/v1/search`. Unset = `/v1/search` returns 501. + pub serper_api_key: Option, } impl AppState { @@ -82,12 +85,22 @@ impl AppState { let llm_chain = Arc::new(ProviderChain::default().await); + // Operator's own Serper.dev key enables /v1/search. Empty/unset + // leaves search returning 501 with a setup hint. 
+ let serper_api_key = std::env::var("SERPER_API_KEY") + .ok() + .filter(|k| !k.trim().is_empty()); + if serper_api_key.is_some() { + info!("search enabled — using SERPER_API_KEY for /v1/search"); + } + Ok(Self { inner: Arc::new(Inner { fetch: Arc::new(fetch), fetch_config: config, llm_chain, api_key: inbound_api_key, + serper_api_key, }), }) } @@ -112,6 +125,11 @@ impl AppState { pub fn api_key(&self) -> Option<&str> { self.inner.api_key.as_deref() } + + /// Operator's Serper.dev key for `/v1/search`, if configured. + pub fn serper_api_key(&self) -> Option<&str> { + self.inner.serper_api_key.as_deref() + } } /// Resolve the outbound cloud key. Prefers `WEBCLAW_CLOUD_API_KEY`;