mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-28 03:29:38 +02:00
feat: CLI --research flag + MCP cloud fallback + structured research output
- --research "query": deep research via cloud API, saves JSON file with report + sources + findings, prints report to stdout
- --deep: longer, more thorough research mode
- MCP extract/summarize: cloud fallback when no local LLM available
- MCP research: returns structured JSON instead of raw text
- Bump to v0.3.7

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
344eea74d9
commit
f7cc0cc5cf
5 changed files with 238 additions and 31 deletions
|
|
@ -3,6 +3,15 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [0.3.7] — 2026-04-03
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **`--research` CLI flag**: run deep research via the cloud API. Prints report to stdout and saves full result (report + sources + findings) to a JSON file. Supports `--deep` for longer reports.
|
||||||
|
- **MCP extract/summarize cloud fallback**: when no local LLM is available, these tools now fall back to the cloud API instead of erroring. Set `WEBCLAW_API_KEY` for automatic fallback.
|
||||||
|
- **MCP research structured output**: the research tool now returns structured JSON (report + sources + findings + metadata) instead of raw text, so agents can reference individual findings and source URLs.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## [0.3.6] — 2026-04-02
|
## [0.3.6] — 2026-04-02
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
|
||||||
12
Cargo.lock
generated
12
Cargo.lock
generated
|
|
@ -3055,7 +3055,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.3.6"
|
version = "0.3.7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3075,7 +3075,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.3.6"
|
version = "0.3.7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -3093,7 +3093,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.3.6"
|
version = "0.3.7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"calamine",
|
"calamine",
|
||||||
|
|
@ -3115,7 +3115,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.3.6"
|
version = "0.3.7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3128,7 +3128,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.3.6"
|
version = "0.3.7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3148,7 +3148,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.3.6"
|
version = "0.3.7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
||||||
members = ["crates/*"]
|
members = ["crates/*"]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "0.3.6"
|
version = "0.3.7"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
license = "AGPL-3.0"
|
license = "AGPL-3.0"
|
||||||
repository = "https://github.com/0xMassi/webclaw"
|
repository = "https://github.com/0xMassi/webclaw"
|
||||||
|
|
|
||||||
|
|
@ -268,6 +268,15 @@ struct Cli {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
cloud: bool,
|
cloud: bool,
|
||||||
|
|
||||||
|
/// Run deep research on a topic via the cloud API. Requires --api-key.
|
||||||
|
/// Saves full result (report + sources + findings) to a JSON file.
|
||||||
|
#[arg(long)]
|
||||||
|
research: Option<String>,
|
||||||
|
|
||||||
|
/// Enable deep research mode (longer, more thorough report). Used with --research.
|
||||||
|
#[arg(long)]
|
||||||
|
deep: bool,
|
||||||
|
|
||||||
/// Output directory: save each page to a separate file instead of stdout.
|
/// Output directory: save each page to a separate file instead of stdout.
|
||||||
/// Works with --crawl, batch (multiple URLs), and single URL mode.
|
/// Works with --crawl, batch (multiple URLs), and single URL mode.
|
||||||
/// Filenames are derived from URL paths (e.g. /docs/api -> docs/api.md).
|
/// Filenames are derived from URL paths (e.g. /docs/api -> docs/api.md).
|
||||||
|
|
@ -2067,6 +2076,141 @@ fn has_llm_flags(cli: &Cli) -> bool {
|
||||||
cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some()
|
cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
|
||||||
|
let api_key = cli
|
||||||
|
.api_key
|
||||||
|
.as_deref()
|
||||||
|
.ok_or("--research requires WEBCLAW_API_KEY (set via env or --api-key)")?;
|
||||||
|
|
||||||
|
let client = reqwest::Client::builder()
|
||||||
|
.timeout(std::time::Duration::from_secs(600))
|
||||||
|
.build()
|
||||||
|
.map_err(|e| format!("http client error: {e}"))?;
|
||||||
|
|
||||||
|
let mut body = serde_json::json!({ "query": query });
|
||||||
|
if cli.deep {
|
||||||
|
body["deep"] = serde_json::json!(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
eprintln!("Starting research: {query}");
|
||||||
|
if cli.deep {
|
||||||
|
eprintln!("Deep mode enabled (longer, more thorough)");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start job
|
||||||
|
let resp = client
|
||||||
|
.post("https://api.webclaw.io/v1/research")
|
||||||
|
.header("Authorization", format!("Bearer {api_key}"))
|
||||||
|
.json(&body)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("API error: {e}"))?
|
||||||
|
.json::<serde_json::Value>()
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("parse error: {e}"))?;
|
||||||
|
|
||||||
|
let job_id = resp
|
||||||
|
.get("id")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.ok_or("API did not return a job ID")?
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
eprintln!("Job started: {job_id}");
|
||||||
|
|
||||||
|
// Poll
|
||||||
|
for poll in 0..200 {
|
||||||
|
tokio::time::sleep(std::time::Duration::from_secs(3)).await;
|
||||||
|
|
||||||
|
let status_resp = client
|
||||||
|
.get(format!("https://api.webclaw.io/v1/research/{job_id}"))
|
||||||
|
.header("Authorization", format!("Bearer {api_key}"))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("poll error: {e}"))?
|
||||||
|
.json::<serde_json::Value>()
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("parse error: {e}"))?;
|
||||||
|
|
||||||
|
let status = status_resp
|
||||||
|
.get("status")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("unknown");
|
||||||
|
|
||||||
|
match status {
|
||||||
|
"completed" => {
|
||||||
|
let report = status_resp
|
||||||
|
.get("report")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("");
|
||||||
|
|
||||||
|
// Save full result to JSON file
|
||||||
|
let slug: String = query
|
||||||
|
.chars()
|
||||||
|
.map(|c| {
|
||||||
|
if c.is_alphanumeric() || c == ' ' {
|
||||||
|
c
|
||||||
|
} else {
|
||||||
|
' '
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect::<String>()
|
||||||
|
.split_whitespace()
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join("-")
|
||||||
|
.to_lowercase();
|
||||||
|
let slug = if slug.len() > 50 { &slug[..50] } else { &slug };
|
||||||
|
let filename = format!("research-{slug}.json");
|
||||||
|
|
||||||
|
let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default();
|
||||||
|
std::fs::write(&filename, &json)
|
||||||
|
.map_err(|e| format!("failed to write {filename}: {e}"))?;
|
||||||
|
|
||||||
|
let elapsed = status_resp
|
||||||
|
.get("elapsed_ms")
|
||||||
|
.and_then(|v| v.as_i64())
|
||||||
|
.unwrap_or(0);
|
||||||
|
let sources = status_resp
|
||||||
|
.get("sources_count")
|
||||||
|
.and_then(|v| v.as_i64())
|
||||||
|
.unwrap_or(0);
|
||||||
|
let findings = status_resp
|
||||||
|
.get("findings_count")
|
||||||
|
.and_then(|v| v.as_i64())
|
||||||
|
.unwrap_or(0);
|
||||||
|
|
||||||
|
eprintln!(
|
||||||
|
"Research complete: {sources} sources, {findings} findings, {:.1}s",
|
||||||
|
elapsed as f64 / 1000.0
|
||||||
|
);
|
||||||
|
eprintln!("Saved to: {filename}");
|
||||||
|
|
||||||
|
// Print report to stdout
|
||||||
|
if !report.is_empty() {
|
||||||
|
println!("{report}");
|
||||||
|
}
|
||||||
|
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
"failed" => {
|
||||||
|
let error = status_resp
|
||||||
|
.get("error")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("unknown error");
|
||||||
|
return Err(format!("Research failed: {error}"));
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
if poll % 10 == 9 {
|
||||||
|
eprintln!("Still researching... ({:.0}s)", (poll + 1) as f64 * 3.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(format!(
|
||||||
|
"Research timed out after ~10 minutes. Check status: GET /v1/research/{job_id}"
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() {
|
async fn main() {
|
||||||
dotenvy::dotenv().ok();
|
dotenvy::dotenv().ok();
|
||||||
|
|
@ -2126,6 +2270,15 @@ async fn main() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --research: deep research via cloud API
|
||||||
|
if let Some(ref query) = cli.research {
|
||||||
|
if let Err(e) = run_research(&cli, query).await {
|
||||||
|
eprintln!("error: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// Collect all URLs from args + --urls-file
|
// Collect all URLs from args + --urls-file
|
||||||
let entries = match collect_urls(&cli) {
|
let entries = match collect_urls(&cli) {
|
||||||
Ok(u) => u,
|
Ok(u) => u,
|
||||||
|
|
|
||||||
|
|
@ -319,34 +319,46 @@ impl WebclawMcp {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract structured data from a web page using an LLM. Provide either a JSON schema or a natural language prompt.
|
/// Extract structured data from a web page using an LLM. Provide either a JSON schema or a natural language prompt.
|
||||||
/// Automatically falls back to the webclaw cloud API when bot protection is detected.
|
/// Falls back to the webclaw cloud API when no local LLM is available or bot protection is detected.
|
||||||
#[tool]
|
#[tool]
|
||||||
async fn extract(
|
async fn extract(
|
||||||
&self,
|
&self,
|
||||||
Parameters(params): Parameters<ExtractParams>,
|
Parameters(params): Parameters<ExtractParams>,
|
||||||
) -> Result<String, String> {
|
) -> Result<String, String> {
|
||||||
validate_url(¶ms.url)?;
|
validate_url(¶ms.url)?;
|
||||||
let chain = self.llm_chain.as_ref().ok_or(
|
|
||||||
"No LLM providers available. Set OPENAI_API_KEY or ANTHROPIC_API_KEY, or run Ollama locally.",
|
|
||||||
)?;
|
|
||||||
|
|
||||||
if params.schema.is_none() && params.prompt.is_none() {
|
if params.schema.is_none() && params.prompt.is_none() {
|
||||||
return Err("Either 'schema' or 'prompt' is required for extraction.".into());
|
return Err("Either 'schema' or 'prompt' is required for extraction.".into());
|
||||||
}
|
}
|
||||||
|
|
||||||
// For extract, if we get a cloud fallback we call the cloud extract endpoint directly
|
// No local LLM — fall back to cloud API directly
|
||||||
|
if self.llm_chain.is_none() {
|
||||||
|
let cloud = self.cloud.as_ref().ok_or(
|
||||||
|
"No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or WEBCLAW_API_KEY for cloud fallback.",
|
||||||
|
)?;
|
||||||
|
let mut body = json!({"url": params.url});
|
||||||
|
if let Some(ref schema) = params.schema {
|
||||||
|
body["schema"] = json!(schema);
|
||||||
|
}
|
||||||
|
if let Some(ref prompt) = params.prompt {
|
||||||
|
body["prompt"] = json!(prompt);
|
||||||
|
}
|
||||||
|
let resp = cloud.post("extract", body).await?;
|
||||||
|
return Ok(serde_json::to_string_pretty(&resp).unwrap_or_default());
|
||||||
|
}
|
||||||
|
|
||||||
|
let chain = self.llm_chain.as_ref().unwrap();
|
||||||
|
|
||||||
let llm_content = match self.smart_fetch_llm(¶ms.url).await? {
|
let llm_content = match self.smart_fetch_llm(¶ms.url).await? {
|
||||||
SmartFetchResult::Local(extraction) => {
|
SmartFetchResult::Local(extraction) => {
|
||||||
webclaw_core::to_llm_text(&extraction, Some(¶ms.url))
|
webclaw_core::to_llm_text(&extraction, Some(¶ms.url))
|
||||||
}
|
}
|
||||||
SmartFetchResult::Cloud(resp) => {
|
SmartFetchResult::Cloud(resp) => resp
|
||||||
// Use the LLM format from cloud, fall back to markdown
|
.get("llm")
|
||||||
resp.get("llm")
|
.or_else(|| resp.get("markdown"))
|
||||||
.or_else(|| resp.get("markdown"))
|
.and_then(|v| v.as_str())
|
||||||
.and_then(|v| v.as_str())
|
.unwrap_or("")
|
||||||
.unwrap_or("")
|
.to_string(),
|
||||||
.to_string()
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let data = if let Some(ref schema) = params.schema {
|
let data = if let Some(ref schema) = params.schema {
|
||||||
|
|
@ -364,16 +376,32 @@ impl WebclawMcp {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Summarize the content of a web page using an LLM.
|
/// Summarize the content of a web page using an LLM.
|
||||||
/// Automatically falls back to the webclaw cloud API when bot protection is detected.
|
/// Falls back to the webclaw cloud API when no local LLM is available or bot protection is detected.
|
||||||
#[tool]
|
#[tool]
|
||||||
async fn summarize(
|
async fn summarize(
|
||||||
&self,
|
&self,
|
||||||
Parameters(params): Parameters<SummarizeParams>,
|
Parameters(params): Parameters<SummarizeParams>,
|
||||||
) -> Result<String, String> {
|
) -> Result<String, String> {
|
||||||
validate_url(¶ms.url)?;
|
validate_url(¶ms.url)?;
|
||||||
let chain = self.llm_chain.as_ref().ok_or(
|
|
||||||
"No LLM providers available. Set OPENAI_API_KEY or ANTHROPIC_API_KEY, or run Ollama locally.",
|
// No local LLM — fall back to cloud API directly
|
||||||
)?;
|
if self.llm_chain.is_none() {
|
||||||
|
let cloud = self.cloud.as_ref().ok_or(
|
||||||
|
"No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or WEBCLAW_API_KEY for cloud fallback.",
|
||||||
|
)?;
|
||||||
|
let mut body = json!({"url": params.url});
|
||||||
|
if let Some(sentences) = params.max_sentences {
|
||||||
|
body["max_sentences"] = json!(sentences);
|
||||||
|
}
|
||||||
|
let resp = cloud.post("summarize", body).await?;
|
||||||
|
let summary = resp.get("summary").and_then(|v| v.as_str()).unwrap_or("");
|
||||||
|
if summary.is_empty() {
|
||||||
|
return Ok(serde_json::to_string_pretty(&resp).unwrap_or_default());
|
||||||
|
}
|
||||||
|
return Ok(summary.to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let chain = self.llm_chain.as_ref().unwrap();
|
||||||
|
|
||||||
let llm_content = match self.smart_fetch_llm(¶ms.url).await? {
|
let llm_content = match self.smart_fetch_llm(¶ms.url).await? {
|
||||||
SmartFetchResult::Local(extraction) => {
|
SmartFetchResult::Local(extraction) => {
|
||||||
|
|
@ -535,15 +563,32 @@ impl WebclawMcp {
|
||||||
|
|
||||||
match status {
|
match status {
|
||||||
"completed" => {
|
"completed" => {
|
||||||
let report = status_resp
|
// Return structured result: report + sources + findings
|
||||||
.get("report")
|
let mut result = json!({
|
||||||
.and_then(|v| v.as_str())
|
"id": job_id,
|
||||||
.unwrap_or("");
|
"status": "completed",
|
||||||
|
});
|
||||||
|
|
||||||
if report.is_empty() {
|
if let Some(report) = status_resp.get("report") {
|
||||||
return Ok(serde_json::to_string_pretty(&status_resp).unwrap_or_default());
|
result["report"] = report.clone();
|
||||||
}
|
}
|
||||||
return Ok(report.to_string());
|
if let Some(sources) = status_resp.get("sources") {
|
||||||
|
result["sources"] = sources.clone();
|
||||||
|
}
|
||||||
|
if let Some(findings) = status_resp.get("findings") {
|
||||||
|
result["findings"] = findings.clone();
|
||||||
|
}
|
||||||
|
if let Some(elapsed) = status_resp.get("elapsed_ms") {
|
||||||
|
result["elapsed_ms"] = elapsed.clone();
|
||||||
|
}
|
||||||
|
if let Some(sc) = status_resp.get("sources_count") {
|
||||||
|
result["sources_count"] = sc.clone();
|
||||||
|
}
|
||||||
|
if let Some(fc) = status_resp.get("findings_count") {
|
||||||
|
result["findings_count"] = fc.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Ok(serde_json::to_string_pretty(&result).unwrap_or_default());
|
||||||
}
|
}
|
||||||
"failed" => {
|
"failed" => {
|
||||||
let error = status_resp
|
let error = status_resp
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue