mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-28 03:29:38 +02:00
Merge pull request #63 from 0xMassi/feat/standalone-search
feat(search): standalone web search via Serper.dev (bring-your-own-key)
This commit is contained in:
commit
c3e5ef5143
10 changed files with 622 additions and 7 deletions
|
|
@ -410,6 +410,43 @@ enum Commands {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
raw: bool,
|
raw: bool,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/// Web search via Serper.dev using YOUR OWN API key.
|
||||||
|
///
|
||||||
|
/// Returns Google organic results (title, link, snippet). With
|
||||||
|
/// `--scrape`, each result page is fetched and extracted to markdown.
|
||||||
|
/// Get a free key at serper.dev, then pass `--serper-key` or set
|
||||||
|
/// `SERPER_API_KEY`.
|
||||||
|
///
|
||||||
|
/// Example: `webclaw search "rust async runtime" --num 5 --scrape`.
|
||||||
|
Search {
|
||||||
|
/// Search query.
|
||||||
|
query: String,
|
||||||
|
|
||||||
|
/// Serper.dev API key. Falls back to the `SERPER_API_KEY` env var.
|
||||||
|
#[arg(long, env = "SERPER_API_KEY")]
|
||||||
|
serper_key: Option<String>,
|
||||||
|
|
||||||
|
/// Number of results to return (1-10).
|
||||||
|
#[arg(long, default_value = "5")]
|
||||||
|
num: usize,
|
||||||
|
|
||||||
|
/// Country code for localization (e.g. "us", "gb", "it").
|
||||||
|
#[arg(long)]
|
||||||
|
country: Option<String>,
|
||||||
|
|
||||||
|
/// Language code for localization (e.g. "en", "it").
|
||||||
|
#[arg(long)]
|
||||||
|
lang: Option<String>,
|
||||||
|
|
||||||
|
/// Fetch + extract each result page and include its markdown.
|
||||||
|
#[arg(long)]
|
||||||
|
scrape: bool,
|
||||||
|
|
||||||
|
/// Output format: `markdown` (human-readable, default) or `json`.
|
||||||
|
#[arg(short, long, default_value = "markdown")]
|
||||||
|
format: OutputFormat,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, ValueEnum)]
|
#[derive(Clone, ValueEnum)]
|
||||||
|
|
@ -1573,6 +1610,73 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Web search via Serper.dev with the caller's own API key.
|
||||||
|
///
|
||||||
|
/// The Serper key is resolved by the caller (flag or `SERPER_API_KEY`
|
||||||
|
/// env, via clap's `env`) and passed in already-unwrapped. When `scrape`
|
||||||
|
/// is set, each result page is fetched + extracted through a FetchClient
|
||||||
|
/// (which carries the browser TLS profile) and its markdown is included.
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
async fn run_search(
|
||||||
|
serper_key: &str,
|
||||||
|
query: &str,
|
||||||
|
num: usize,
|
||||||
|
country: Option<&str>,
|
||||||
|
lang: Option<&str>,
|
||||||
|
scrape: bool,
|
||||||
|
format: &OutputFormat,
|
||||||
|
) -> Result<(), String> {
|
||||||
|
// Default fetch config is enough: search localization is handled by
|
||||||
|
// Serper's gl/hl, and the result-page scrape just needs a standard
|
||||||
|
// browser profile. Attach cloud fallback when WEBCLAW_API_KEY is set
|
||||||
|
// so scraped pages behind bot protection can still escalate.
|
||||||
|
let mut client = webclaw_fetch::FetchClient::new(webclaw_fetch::FetchConfig::default())
|
||||||
|
.map_err(|e| format!("client error: {e}"))?;
|
||||||
|
if let Some(cloud) = webclaw_fetch::cloud::CloudClient::from_env() {
|
||||||
|
client = client.with_cloud(cloud);
|
||||||
|
}
|
||||||
|
|
||||||
|
let opts = webclaw_fetch::SearchOptions {
|
||||||
|
num_results: num,
|
||||||
|
country: country.map(str::to_string),
|
||||||
|
lang: lang.map(str::to_string),
|
||||||
|
scrape,
|
||||||
|
};
|
||||||
|
|
||||||
|
let results = webclaw_fetch::search(&client, serper_key, query, &opts)
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("search error: {e}"))?;
|
||||||
|
|
||||||
|
if matches!(format, OutputFormat::Json) {
|
||||||
|
let json = serde_json::json!({ "query": query, "results": results });
|
||||||
|
match serde_json::to_string_pretty(&json) {
|
||||||
|
Ok(s) => println!("{s}"),
|
||||||
|
Err(e) => return Err(format!("JSON encode failed: {e}")),
|
||||||
|
}
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
if results.is_empty() {
|
||||||
|
eprintln!("no results for \"{query}\"");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
for r in &results {
|
||||||
|
println!("{}. {}", r.position, r.title);
|
||||||
|
println!(" {}", r.link);
|
||||||
|
if !r.snippet.is_empty() {
|
||||||
|
println!(" {}", r.snippet);
|
||||||
|
}
|
||||||
|
if let Some(ref content) = r.content {
|
||||||
|
println!();
|
||||||
|
println!("{content}");
|
||||||
|
}
|
||||||
|
println!();
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
async fn run_map(cli: &Cli) -> Result<(), String> {
|
async fn run_map(cli: &Cli) -> Result<(), String> {
|
||||||
let url = cli
|
let url = cli
|
||||||
.urls
|
.urls
|
||||||
|
|
@ -2589,6 +2693,40 @@ async fn main() {
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
Commands::Search {
|
||||||
|
query,
|
||||||
|
serper_key,
|
||||||
|
num,
|
||||||
|
country,
|
||||||
|
lang,
|
||||||
|
scrape,
|
||||||
|
format,
|
||||||
|
} => {
|
||||||
|
let key = match serper_key {
|
||||||
|
Some(k) if !k.trim().is_empty() => k.clone(),
|
||||||
|
_ => {
|
||||||
|
eprintln!(
|
||||||
|
"error: search requires a Serper.dev API key: pass --serper-key or set SERPER_API_KEY (get one free at serper.dev)"
|
||||||
|
);
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if let Err(e) = run_search(
|
||||||
|
&key,
|
||||||
|
query,
|
||||||
|
*num,
|
||||||
|
country.as_deref(),
|
||||||
|
lang.as_deref(),
|
||||||
|
*scrape,
|
||||||
|
format,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
eprintln!("error: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@ pub mod locale;
|
||||||
pub mod progress;
|
pub mod progress;
|
||||||
pub mod proxy;
|
pub mod proxy;
|
||||||
pub mod reddit;
|
pub mod reddit;
|
||||||
|
pub mod search;
|
||||||
pub mod sitemap;
|
pub mod sitemap;
|
||||||
pub mod tls;
|
pub mod tls;
|
||||||
pub mod url_security;
|
pub mod url_security;
|
||||||
|
|
@ -27,5 +28,6 @@ pub use http::HeaderMap;
|
||||||
pub use locale::{accept_language_for_tld, accept_language_for_url};
|
pub use locale::{accept_language_for_tld, accept_language_for_url};
|
||||||
pub use progress::{PROGRESS_INTERVAL, with_progress};
|
pub use progress::{PROGRESS_INTERVAL, with_progress};
|
||||||
pub use proxy::{parse_proxy_file, parse_proxy_line};
|
pub use proxy::{parse_proxy_file, parse_proxy_line};
|
||||||
|
pub use search::{SearchOptions, SearchResult, parse_serper_organic, search};
|
||||||
pub use sitemap::SitemapEntry;
|
pub use sitemap::SitemapEntry;
|
||||||
pub use webclaw_pdf::PdfMode;
|
pub use webclaw_pdf::PdfMode;
|
||||||
|
|
|
||||||
322
crates/webclaw-fetch/src/search.rs
Normal file
322
crates/webclaw-fetch/src/search.rs
Normal file
|
|
@ -0,0 +1,322 @@
|
||||||
|
//! Web search via Serper.dev (Google results) with optional content scraping.
|
||||||
|
//!
|
||||||
|
//! This is the self-hosted search path: the caller supplies their own
|
||||||
|
//! Serper.dev API key (free tier at serper.dev). The CLI, MCP server, and
|
||||||
|
//! OSS REST server all route through [`search`] so search works without the
|
||||||
|
//! hosted webclaw API.
|
||||||
|
//!
|
||||||
|
//! Serper returns a plain JSON API, so we hit it with a vanilla wreq client
|
||||||
|
//! (10s timeout) — no browser TLS fingerprinting needed. When `scrape` is
|
||||||
|
//! set, the top results are fetched through the caller's [`FetchClient`]
|
||||||
|
//! (which *does* carry the fingerprinting) and extracted to markdown.
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_json::{Value, json};
|
||||||
|
use tokio::sync::Semaphore;
|
||||||
|
use tracing::warn;
|
||||||
|
|
||||||
|
use crate::client::FetchClient;
|
||||||
|
use crate::error::FetchError;
|
||||||
|
|
||||||
|
/// Endpoint that receives the search POST (Serper.dev's Google search API).
const SERPER_URL: &str = "https://google.serper.dev/search";

/// Maximum number of result pages fetched at the same time when `scrape`
/// is on. Caps the fan-out so a full result set does not hit the proxy
/// pool / remote hosts all at once.
const SCRAPE_CONCURRENCY: usize = 5;
|
||||||
|
|
||||||
|
/// Tunables for a single [`search`] call.
#[derive(Debug, Clone)]
pub struct SearchOptions {
    /// How many organic results to ask for; `search` clamps this to `1..=10`.
    pub num_results: usize,
    /// Optional country code, sent to Serper as `gl` (e.g. `"us"`, `"gb"`).
    pub country: Option<String>,
    /// Optional language code, sent to Serper as `hl` (e.g. `"en"`, `"it"`).
    pub lang: Option<String>,
    /// If set, each result page is fetched + extracted and stored in `content`.
    pub scrape: bool,
}

impl Default for SearchOptions {
    /// Five results, no localization, no scraping.
    fn default() -> Self {
        SearchOptions {
            scrape: false,
            lang: None,
            country: None,
            num_results: 5,
        }
    }
}
|
||||||
|
|
||||||
|
/// A single organic search result. When `scrape` was requested and the
|
||||||
|
/// fetch succeeded, `content` holds the extracted markdown; otherwise it
|
||||||
|
/// is `None` (a per-result fetch failure never fails the whole search).
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct SearchResult {
|
||||||
|
pub title: String,
|
||||||
|
pub link: String,
|
||||||
|
pub snippet: String,
|
||||||
|
pub position: usize,
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
pub content: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run a web search through Serper.dev.
|
||||||
|
///
|
||||||
|
/// `client` — the caller's [`FetchClient`], used only when `opts.scrape`
|
||||||
|
/// is set (to fetch + extract the result pages).
|
||||||
|
/// `serper_key` — the caller's Serper.dev API key.
|
||||||
|
/// `query` — the search query.
|
||||||
|
/// `opts` — result count, localization, and whether to scrape.
|
||||||
|
///
|
||||||
|
/// Returns the organic results in Serper's order. With `scrape` enabled,
|
||||||
|
/// the top results are fetched concurrently (bounded) and their extracted
|
||||||
|
/// markdown is attached to `content`.
|
||||||
|
pub async fn search(
|
||||||
|
client: &FetchClient,
|
||||||
|
serper_key: &str,
|
||||||
|
query: &str,
|
||||||
|
opts: &SearchOptions,
|
||||||
|
) -> Result<Vec<SearchResult>, FetchError> {
|
||||||
|
let num = opts.num_results.clamp(1, 10);
|
||||||
|
|
||||||
|
let response = call_serper(
|
||||||
|
serper_key,
|
||||||
|
query,
|
||||||
|
num,
|
||||||
|
opts.country.as_deref(),
|
||||||
|
opts.lang.as_deref(),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let mut results = parse_serper_organic(&response);
|
||||||
|
|
||||||
|
if opts.scrape && !results.is_empty() {
|
||||||
|
scrape_results(client, &mut results).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(results)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// POST the query to Serper.dev and return the raw JSON response.
|
||||||
|
///
|
||||||
|
/// Builds a plain wreq client (no browser emulation — Serper is a JSON
|
||||||
|
/// API, not a bot-protected page). Non-2xx responses are surfaced as a
|
||||||
|
/// [`FetchError::Build`] carrying the status and body so the caller can
|
||||||
|
/// show Serper's own error (bad key, quota exceeded, etc.).
|
||||||
|
async fn call_serper(
|
||||||
|
api_key: &str,
|
||||||
|
query: &str,
|
||||||
|
num: usize,
|
||||||
|
country: Option<&str>,
|
||||||
|
lang: Option<&str>,
|
||||||
|
) -> Result<Value, FetchError> {
|
||||||
|
let http = wreq::Client::builder()
|
||||||
|
.timeout(Duration::from_secs(10))
|
||||||
|
.build()
|
||||||
|
.map_err(|e| FetchError::Build(format!("failed to build serper client: {e}")))?;
|
||||||
|
|
||||||
|
let mut body = json!({ "q": query, "num": num });
|
||||||
|
if let Some(gl) = country {
|
||||||
|
body["gl"] = json!(gl);
|
||||||
|
}
|
||||||
|
if let Some(hl) = lang {
|
||||||
|
body["hl"] = json!(hl);
|
||||||
|
}
|
||||||
|
// Serialize ourselves rather than `.json()` — the wreq `json` feature
|
||||||
|
// is not enabled in this crate and isn't worth pulling in for one call.
|
||||||
|
let payload = serde_json::to_vec(&body)
|
||||||
|
.map_err(|e| FetchError::Build(format!("serper request encode error: {e}")))?;
|
||||||
|
|
||||||
|
let resp = http
|
||||||
|
.post(SERPER_URL)
|
||||||
|
.header("X-API-KEY", api_key)
|
||||||
|
.header("Content-Type", "application/json")
|
||||||
|
.body(payload)
|
||||||
|
.send()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let status = resp.status();
|
||||||
|
if !status.is_success() {
|
||||||
|
let code = status.as_u16();
|
||||||
|
let text = resp.text().await.unwrap_or_default();
|
||||||
|
return Err(FetchError::Build(format!("serper returned {code}: {text}")));
|
||||||
|
}
|
||||||
|
|
||||||
|
let text = resp
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.map_err(|e| FetchError::BodyDecode(format!("serper response read error: {e}")))?;
|
||||||
|
serde_json::from_str::<Value>(&text)
|
||||||
|
.map_err(|e| FetchError::BodyDecode(format!("serper response parse error: {e}")))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse the `organic` array of a Serper response into [`SearchResult`]s.
|
||||||
|
///
|
||||||
|
/// Pure (no network), so it is unit-tested against a fixture. Entries
|
||||||
|
/// missing `title` or `link` are skipped; `snippet` defaults to empty.
|
||||||
|
/// `position` is 1-based over the kept entries.
|
||||||
|
pub fn parse_serper_organic(response: &Value) -> Vec<SearchResult> {
|
||||||
|
let Some(organic) = response.get("organic").and_then(|v| v.as_array()) else {
|
||||||
|
return Vec::new();
|
||||||
|
};
|
||||||
|
|
||||||
|
organic
|
||||||
|
.iter()
|
||||||
|
.filter_map(|item| {
|
||||||
|
let title = item.get("title")?.as_str()?.to_string();
|
||||||
|
let link = item.get("link")?.as_str()?.to_string();
|
||||||
|
let snippet = item
|
||||||
|
.get("snippet")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("")
|
||||||
|
.to_string();
|
||||||
|
Some(SearchResult {
|
||||||
|
title,
|
||||||
|
link,
|
||||||
|
snippet,
|
||||||
|
// Filled in after collection so it tracks kept entries,
|
||||||
|
// not the raw array index (which may include skips).
|
||||||
|
position: 0,
|
||||||
|
content: None,
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.enumerate()
|
||||||
|
.map(|(i, mut r)| {
|
||||||
|
r.position = i + 1;
|
||||||
|
r
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetch + extract the result pages and attach markdown to `content`.
|
||||||
|
///
|
||||||
|
/// Bounded by [`SCRAPE_CONCURRENCY`]. A per-result fetch or extraction
|
||||||
|
/// failure leaves that result's `content` as `None` rather than failing
|
||||||
|
/// the whole search.
|
||||||
|
async fn scrape_results(client: &FetchClient, results: &mut [SearchResult]) {
|
||||||
|
let sem = Arc::new(Semaphore::new(SCRAPE_CONCURRENCY));
|
||||||
|
|
||||||
|
// Collect owned links first so the per-result futures don't borrow
|
||||||
|
// `results`. That keeps the future captures free of the slice's
|
||||||
|
// lifetime, which is what lets this compile inside the MCP `#[tool]`
|
||||||
|
// macro's stricter `Send`/lifetime bounds.
|
||||||
|
let links: Vec<String> = results.iter().map(|r| r.link.clone()).collect();
|
||||||
|
|
||||||
|
let scrapes = links.into_iter().map(|link| {
|
||||||
|
let sem = sem.clone();
|
||||||
|
async move {
|
||||||
|
// If the semaphore is closed (shutdown race), skip rather than panic.
|
||||||
|
let _permit = match sem.acquire().await {
|
||||||
|
Ok(p) => p,
|
||||||
|
Err(_) => return None,
|
||||||
|
};
|
||||||
|
match client.fetch(&link).await {
|
||||||
|
Ok(fetched) => match webclaw_core::extract(&fetched.html, Some(&fetched.url)) {
|
||||||
|
Ok(extraction) => Some(extraction.content.markdown),
|
||||||
|
Err(e) => {
|
||||||
|
warn!(url = %link, error = %e, "search: extraction failed");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Err(e) => {
|
||||||
|
warn!(url = %link, error = %e, "search: fetch failed");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// `join_all` drives every scrape future concurrently and returns
|
||||||
|
// results in input order; the semaphore caps how many fetches run at
|
||||||
|
// once. Result set is tiny (≤10), so the all-at-once poll is fine.
|
||||||
|
let contents = futures_util::future::join_all(scrapes).await;
|
||||||
|
for (r, content) in results.iter_mut().zip(contents) {
|
||||||
|
r.content = content;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Serper-shaped payload covering the three parsing cases: a complete
    /// entry, an entry without `snippet`, and an entry without `link`.
    fn fixture() -> Value {
        json!({
            "searchParameters": { "q": "rust async", "type": "search" },
            "organic": [
                {
                    "title": "Async Rust",
                    "link": "https://example.com/async",
                    "snippet": "Learn async in Rust.",
                    "position": 1
                },
                // No snippet: must fall back to the empty string.
                {
                    "title": "Tokio",
                    "link": "https://tokio.rs"
                },
                // No link: must be dropped without shifting kept positions.
                {
                    "title": "No Link Here"
                }
            ]
        })
    }

    #[test]
    fn parses_organic_results() {
        let parsed = parse_serper_organic(&fixture());
        assert_eq!(parsed.len(), 2);

        let first = &parsed[0];
        assert_eq!(first.title, "Async Rust");
        assert_eq!(first.link, "https://example.com/async");
        assert_eq!(first.snippet, "Learn async in Rust.");
        assert_eq!(first.position, 1);
        assert!(first.content.is_none());

        // The snippet-less entry is kept with an empty snippet, and it is
        // ranked 2 because the link-less entry is dropped, not counted.
        let second = &parsed[1];
        assert_eq!(second.title, "Tokio");
        assert_eq!(second.snippet, "");
        assert_eq!(second.position, 2);
    }

    #[test]
    fn missing_organic_key_yields_empty() {
        for payload in [json!({}), json!({ "organic": "not-an-array" })] {
            assert!(parse_serper_organic(&payload).is_empty());
        }
    }

    #[test]
    fn search_result_serializes_without_null_content() {
        let with_content = |content: Option<String>| SearchResult {
            title: "T".into(),
            link: "https://e.com".into(),
            snippet: "s".into(),
            position: 1,
            content,
        };

        let bare = serde_json::to_value(with_content(None)).unwrap();
        assert!(bare.get("content").is_none(), "None content should be skipped");

        let filled = serde_json::to_value(with_content(Some("# md".into()))).unwrap();
        assert_eq!(filled.get("content").and_then(Value::as_str), Some("# md"));
    }

    #[test]
    fn default_options() {
        let defaults = SearchOptions::default();
        assert_eq!(defaults.num_results, 5);
        assert!(!defaults.scrape);
        assert!(defaults.country.is_none());
        assert!(defaults.lang.is_none());
    }
}
|
||||||
|
|
@ -668,13 +668,55 @@ impl WebclawMcp {
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Search the web for a query and return structured results. Requires WEBCLAW_API_KEY.
|
/// Search the web for a query and return structured results.
|
||||||
|
///
|
||||||
|
/// Resolves the backend in priority order:
|
||||||
|
/// 1. `SERPER_API_KEY` set → local Serper.dev search with the user's
|
||||||
|
/// own key (no hosted API needed). Supports `country`, `lang`, and
|
||||||
|
/// `scrape` (fetch + extract each result page).
|
||||||
|
/// 2. else `WEBCLAW_API_KEY` set → the hosted webclaw search API.
|
||||||
|
/// 3. else → an error explaining both options.
|
||||||
#[tool]
|
#[tool]
|
||||||
async fn search(&self, Parameters(params): Parameters<SearchParams>) -> Result<String, String> {
|
async fn search(&self, Parameters(params): Parameters<SearchParams>) -> Result<String, String> {
|
||||||
let cloud = self
|
// Local path: user's own Serper key. Preferred when present so the
|
||||||
.cloud
|
// tool works without the hosted API and without spending credits.
|
||||||
.as_ref()
|
if let Ok(serper_key) = std::env::var("SERPER_API_KEY")
|
||||||
.ok_or("Search requires WEBCLAW_API_KEY. Get a key at https://webclaw.io")?;
|
&& !serper_key.trim().is_empty()
|
||||||
|
{
|
||||||
|
let opts = webclaw_fetch::SearchOptions {
|
||||||
|
num_results: params.num_results.unwrap_or(5) as usize,
|
||||||
|
country: params.country.clone(),
|
||||||
|
lang: params.lang.clone(),
|
||||||
|
scrape: params.scrape.unwrap_or(false),
|
||||||
|
};
|
||||||
|
let results = webclaw_fetch::search(
|
||||||
|
self.fetch_client.as_ref(),
|
||||||
|
&serper_key,
|
||||||
|
&params.query,
|
||||||
|
&opts,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("search error: {e}"))?;
|
||||||
|
|
||||||
|
let mut output = format!("Found {} results:\n\n", results.len());
|
||||||
|
for r in &results {
|
||||||
|
output.push_str(&format!("{}. {}\n {}\n", r.position, r.title, r.link));
|
||||||
|
if !r.snippet.is_empty() {
|
||||||
|
output.push_str(&format!(" {}\n", r.snippet));
|
||||||
|
}
|
||||||
|
if let Some(ref content) = r.content {
|
||||||
|
output.push_str(&format!("\n{content}\n"));
|
||||||
|
}
|
||||||
|
output.push('\n');
|
||||||
|
}
|
||||||
|
return Ok(output);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hosted path: the webclaw cloud API.
|
||||||
|
let cloud = self.cloud.as_ref().ok_or(
|
||||||
|
"Search requires a search backend: set SERPER_API_KEY for local search \
|
||||||
|
(get one free at serper.dev), or WEBCLAW_API_KEY for the hosted API.",
|
||||||
|
)?;
|
||||||
|
|
||||||
let mut body = json!({ "query": params.query });
|
let mut body = json!({ "query": params.query });
|
||||||
if let Some(num) = params.num_results {
|
if let Some(num) = params.num_results {
|
||||||
|
|
|
||||||
|
|
@ -160,9 +160,18 @@ pub struct ResearchParams {
|
||||||
pub struct SearchParams {
|
pub struct SearchParams {
|
||||||
/// Search query
|
/// Search query
|
||||||
pub query: String,
|
pub query: String,
|
||||||
/// Number of results to return (default: 10)
|
/// Number of results to return (default: 5, max: 10)
|
||||||
#[serde(default, deserialize_with = "deser_opt_u32_or_str")]
|
#[serde(default, deserialize_with = "deser_opt_u32_or_str")]
|
||||||
pub num_results: Option<u32>,
|
pub num_results: Option<u32>,
|
||||||
|
/// Country code for localization (e.g. "us", "gb", "it").
|
||||||
|
/// Only used by the local Serper path (SERPER_API_KEY).
|
||||||
|
pub country: Option<String>,
|
||||||
|
/// Language code for localization (e.g. "en", "it").
|
||||||
|
/// Only used by the local Serper path (SERPER_API_KEY).
|
||||||
|
pub lang: Option<String>,
|
||||||
|
/// When true, fetch + extract each result page and include its
|
||||||
|
/// markdown. Only used by the local Serper path (SERPER_API_KEY).
|
||||||
|
pub scrape: Option<bool>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parameters for `vertical_scrape`: run a site-specific extractor by name.
|
/// Parameters for `vertical_scrape`: run a site-specific extractor by name.
|
||||||
|
|
|
||||||
|
|
@ -38,16 +38,24 @@ pub enum ApiError {
|
||||||
|
|
||||||
#[error("internal: {0}")]
|
#[error("internal: {0}")]
|
||||||
Internal(String),
|
Internal(String),
|
||||||
|
|
||||||
|
#[error("{0}")]
|
||||||
|
NotImplemented(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ApiError {
|
impl ApiError {
|
||||||
pub fn bad_request(msg: impl Into<String>) -> Self {
|
pub fn bad_request(msg: impl Into<String>) -> Self {
|
||||||
Self::BadRequest(msg.into())
|
Self::BadRequest(msg.into())
|
||||||
}
|
}
|
||||||
#[allow(dead_code)]
|
|
||||||
pub fn internal(msg: impl Into<String>) -> Self {
|
pub fn internal(msg: impl Into<String>) -> Self {
|
||||||
Self::Internal(msg.into())
|
Self::Internal(msg.into())
|
||||||
}
|
}
|
||||||
|
/// 501 — a capability the operator hasn't configured (e.g. search
|
||||||
|
/// without `SERPER_API_KEY`). Distinct from `BadRequest` (client's
|
||||||
|
/// fault) and `Internal` (our fault): it's a deployment-config gap.
|
||||||
|
pub fn not_implemented(msg: impl Into<String>) -> Self {
|
||||||
|
Self::NotImplemented(msg.into())
|
||||||
|
}
|
||||||
|
|
||||||
fn status(&self) -> StatusCode {
|
fn status(&self) -> StatusCode {
|
||||||
match self {
|
match self {
|
||||||
|
|
@ -57,6 +65,7 @@ impl ApiError {
|
||||||
Self::Fetch(_) => StatusCode::BAD_GATEWAY,
|
Self::Fetch(_) => StatusCode::BAD_GATEWAY,
|
||||||
Self::Extract(_) | Self::Llm(_) => StatusCode::UNPROCESSABLE_ENTITY,
|
Self::Extract(_) | Self::Llm(_) => StatusCode::UNPROCESSABLE_ENTITY,
|
||||||
Self::Internal(_) => StatusCode::INTERNAL_SERVER_ERROR,
|
Self::Internal(_) => StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
Self::NotImplemented(_) => StatusCode::NOT_IMPLEMENTED,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -94,6 +94,7 @@ async fn main() -> anyhow::Result<()> {
|
||||||
)
|
)
|
||||||
.route("/crawl", post(routes::crawl::crawl))
|
.route("/crawl", post(routes::crawl::crawl))
|
||||||
.route("/map", post(routes::map::map))
|
.route("/map", post(routes::map::map))
|
||||||
|
.route("/search", post(routes::search::search))
|
||||||
.route("/batch", post(routes::batch::batch))
|
.route("/batch", post(routes::batch::batch))
|
||||||
.route("/extract", post(routes::extract::extract))
|
.route("/extract", post(routes::extract::extract))
|
||||||
.route("/extractors", get(routes::structured::list_extractors))
|
.route("/extractors", get(routes::structured::list_extractors))
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,11 @@
|
||||||
//! (anti-bot bypass with stealth Chrome, JS rendering at scale,
|
//! (anti-bot bypass with stealth Chrome, JS rendering at scale,
|
||||||
//! per-user auth, billing, async job queues, agent loops) are
|
//! per-user auth, billing, async job queues, agent loops) are
|
||||||
//! intentionally not implemented here. Use api.webclaw.io for those.
|
//! intentionally not implemented here. Use api.webclaw.io for those.
|
||||||
|
//!
|
||||||
|
//! `POST /v1/search` is supported when the operator supplies their own
|
||||||
|
//! Serper.dev API key via the `SERPER_API_KEY` env var (free key at
|
||||||
|
//! serper.dev). Without it, the route returns 501. This is the
|
||||||
|
//! bring-your-own-key path — no hosted webclaw account required.
|
||||||
|
|
||||||
pub mod batch;
|
pub mod batch;
|
||||||
pub mod brand;
|
pub mod brand;
|
||||||
|
|
@ -15,5 +20,6 @@ pub mod extract;
|
||||||
pub mod health;
|
pub mod health;
|
||||||
pub mod map;
|
pub mod map;
|
||||||
pub mod scrape;
|
pub mod scrape;
|
||||||
|
pub mod search;
|
||||||
pub mod structured;
|
pub mod structured;
|
||||||
pub mod summarize;
|
pub mod summarize;
|
||||||
|
|
|
||||||
68
crates/webclaw-server/src/routes/search.rs
Normal file
68
crates/webclaw-server/src/routes/search.rs
Normal file
|
|
@ -0,0 +1,68 @@
|
||||||
|
//! POST /v1/search — web search via Serper.dev using the operator's own key.
|
||||||
|
//!
|
||||||
|
//! Enabled only when the server is started with `SERPER_API_KEY` set
|
||||||
|
//! (get a free key at serper.dev). Without it, this route returns 501 so
|
||||||
|
//! self-hosters know the capability exists but isn't configured.
|
||||||
|
//!
|
||||||
|
//! With `scrape: true`, each result page is fetched + extracted to
|
||||||
|
//! markdown via the shared [`webclaw_fetch::FetchClient`]. A per-result
|
||||||
|
//! fetch failure leaves that result's `content` null; it never fails the
|
||||||
|
//! whole search.
|
||||||
|
|
||||||
|
use axum::{Json, extract::State};
|
||||||
|
use serde::Deserialize;
|
||||||
|
use serde_json::{Value, json};
|
||||||
|
|
||||||
|
use crate::{error::ApiError, state::AppState};
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct SearchRequest {
|
||||||
|
pub query: String,
|
||||||
|
/// Max results to return (default 5, clamped to 1..=10).
|
||||||
|
#[serde(default = "default_num_results")]
|
||||||
|
pub num_results: usize,
|
||||||
|
/// Country code for localization (e.g. "us", "gb", "it").
|
||||||
|
pub country: Option<String>,
|
||||||
|
/// Language code for localization (e.g. "en", "it").
|
||||||
|
pub lang: Option<String>,
|
||||||
|
/// When true, fetch + extract each result page and include its markdown.
|
||||||
|
#[serde(default)]
|
||||||
|
pub scrape: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serde fallback for `num_results` when the request body omits it.
fn default_num_results() -> usize {
    const FALLBACK: usize = 5;
    FALLBACK
}
|
||||||
|
|
||||||
|
pub async fn search(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
Json(req): Json<SearchRequest>,
|
||||||
|
) -> Result<Json<Value>, ApiError> {
|
||||||
|
if req.query.trim().is_empty() {
|
||||||
|
return Err(ApiError::bad_request("`query` is required"));
|
||||||
|
}
|
||||||
|
|
||||||
|
let serper_key = state.serper_api_key().ok_or_else(|| {
|
||||||
|
ApiError::not_implemented(
|
||||||
|
"search is not configured: start the server with SERPER_API_KEY set \
|
||||||
|
(get a free key at serper.dev)",
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let opts = webclaw_fetch::SearchOptions {
|
||||||
|
num_results: req.num_results,
|
||||||
|
country: req.country.clone(),
|
||||||
|
lang: req.lang.clone(),
|
||||||
|
scrape: req.scrape,
|
||||||
|
};
|
||||||
|
|
||||||
|
let results = webclaw_fetch::search(state.fetch(), serper_key, &req.query, &opts)
|
||||||
|
.await
|
||||||
|
.map_err(|e| ApiError::internal(format!("search failed: {e}")))?;
|
||||||
|
|
||||||
|
Ok(Json(json!({
|
||||||
|
"query": req.query,
|
||||||
|
"count": results.len(),
|
||||||
|
"results": results,
|
||||||
|
})))
|
||||||
|
}
|
||||||
|
|
@ -36,6 +36,9 @@ struct Inner {
|
||||||
pub fetch: Arc<FetchClient>,
|
pub fetch: Arc<FetchClient>,
|
||||||
/// Inbound bearer-auth token for this server's own `/v1/*` surface.
|
/// Inbound bearer-auth token for this server's own `/v1/*` surface.
|
||||||
pub api_key: Option<String>,
|
pub api_key: Option<String>,
|
||||||
|
/// Operator's own Serper.dev API key, read from `SERPER_API_KEY`.
|
||||||
|
/// Enables `/v1/search`. Unset = `/v1/search` returns 501.
|
||||||
|
pub serper_api_key: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AppState {
|
impl AppState {
|
||||||
|
|
@ -66,10 +69,20 @@ impl AppState {
|
||||||
fetch = fetch.with_cloud(cloud);
|
fetch = fetch.with_cloud(cloud);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Operator's own Serper.dev key enables /v1/search. Empty/unset
|
||||||
|
// leaves search returning 501 with a setup hint.
|
||||||
|
let serper_api_key = std::env::var("SERPER_API_KEY")
|
||||||
|
.ok()
|
||||||
|
.filter(|k| !k.trim().is_empty());
|
||||||
|
if serper_api_key.is_some() {
|
||||||
|
info!("search enabled — using SERPER_API_KEY for /v1/search");
|
||||||
|
}
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
inner: Arc::new(Inner {
|
inner: Arc::new(Inner {
|
||||||
fetch: Arc::new(fetch),
|
fetch: Arc::new(fetch),
|
||||||
api_key: inbound_api_key,
|
api_key: inbound_api_key,
|
||||||
|
serper_api_key,
|
||||||
}),
|
}),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
@ -81,6 +94,11 @@ impl AppState {
|
||||||
pub fn api_key(&self) -> Option<&str> {
|
pub fn api_key(&self) -> Option<&str> {
|
||||||
self.inner.api_key.as_deref()
|
self.inner.api_key.as_deref()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Operator's Serper.dev key for `/v1/search`, if configured.
|
||||||
|
pub fn serper_api_key(&self) -> Option<&str> {
|
||||||
|
self.inner.serper_api_key.as_deref()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Resolve the outbound cloud key. Prefers `WEBCLAW_CLOUD_API_KEY`;
|
/// Resolve the outbound cloud key. Prefers `WEBCLAW_CLOUD_API_KEY`;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue