webclaw/crates/webclaw-cli/src/main.rs
//! CLI entry point -- wires webclaw-core and webclaw-fetch into a single command.
//! All extraction and fetching logic lives in sibling crates; this is pure plumbing.
#![allow(dead_code)]
mod cloud;
use std::io::{self, Read as _};
use std::process;
use std::sync::Arc;
use clap::{Parser, ValueEnum};
use tracing_subscriber::EnvFilter;
use webclaw_core::{
ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options, to_llm_text,
};
use webclaw_fetch::{
BatchExtractResult, BrowserProfile, CrawlConfig, CrawlResult, Crawler, FetchClient,
FetchConfig, FetchResult, SitemapEntry,
};
use webclaw_llm::LlmProvider;
use webclaw_pdf::PdfMode;
/// Known anti-bot challenge page titles (case-insensitive prefix match).
const ANTIBOT_TITLES: &[&str] = &[
"just a moment",
"attention required",
"access denied",
"checking your browser",
"please wait",
"one more step",
"verify you are human",
"bot verification",
"security check",
"ddos protection",
];
/// Detect why a page returned empty content.
enum EmptyReason {
/// Anti-bot challenge page (Cloudflare, Akamai, etc.)
Antibot,
/// JS-only SPA that returns an empty shell without a browser
JsRequired,
/// Page has content — not empty
None,
}
fn detect_empty(result: &ExtractionResult) -> EmptyReason {
// Has real content — nothing to warn about
if result.metadata.word_count > 50 || !result.content.markdown.is_empty() {
return EmptyReason::None;
}
// Check for known anti-bot challenge titles
if let Some(ref title) = result.metadata.title {
let lower = title.to_lowercase();
if ANTIBOT_TITLES.iter().any(|t| lower.starts_with(t)) {
return EmptyReason::Antibot;
}
}
// Empty content with no title or a generic SPA shell = JS-only site
if result.metadata.word_count == 0 && result.content.links.is_empty() {
return EmptyReason::JsRequired;
}
EmptyReason::None
}
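// A minimal sanity check of the rule above: titles match ANTIBOT_TITLES by
// case-insensitive prefix, so trailing vendor text ("Just a moment...") still hits.
#[cfg(test)]
mod antibot_title_tests {
    use super::ANTIBOT_TITLES;

    #[test]
    fn challenge_titles_match_by_lowercased_prefix() {
        let challenge = "Just a moment...".to_lowercase();
        assert!(ANTIBOT_TITLES.iter().any(|t| challenge.starts_with(t)));
        let normal = "Understanding Rust Lifetimes".to_lowercase();
        assert!(!ANTIBOT_TITLES.iter().any(|t| normal.starts_with(t)));
    }
}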
fn warn_empty(url: &str, reason: &EmptyReason) {
match reason {
EmptyReason::Antibot => eprintln!(
"\x1b[33mwarning:\x1b[0m Anti-bot protection detected on {url}\n\
This site requires CAPTCHA solving or browser rendering.\n\
Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing"
),
EmptyReason::JsRequired => eprintln!(
"\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\
This site requires JavaScript rendering (SPA).\n\
Use the webclaw Cloud API for JS rendering: https://webclaw.io/pricing"
),
EmptyReason::None => {}
}
}
#[derive(Parser)]
#[command(name = "webclaw", about = "Extract web content for LLMs", version)]
struct Cli {
/// URLs to fetch (multiple allowed)
#[arg()]
urls: Vec<String>,
/// File with URLs (one per line)
#[arg(long)]
urls_file: Option<String>,
/// Output format (markdown, json, text, llm)
#[arg(short, long, default_value = "markdown")]
format: OutputFormat,
/// Browser to impersonate
#[arg(short, long, default_value = "chrome")]
browser: Browser,
/// Proxy URL (http://user:pass@host:port or socks5://host:port)
#[arg(short, long, env = "WEBCLAW_PROXY")]
proxy: Option<String>,
/// File with proxies (host:port:user:pass, one per line). Rotates per request.
#[arg(long, env = "WEBCLAW_PROXY_FILE")]
proxy_file: Option<String>,
/// Request timeout in seconds
#[arg(short, long, default_value = "30")]
timeout: u64,
/// Extract from local HTML file instead of fetching
#[arg(long)]
file: Option<String>,
/// Read HTML from stdin
#[arg(long)]
stdin: bool,
/// Include metadata in output (always included in JSON)
#[arg(long)]
metadata: bool,
/// Output raw fetched HTML instead of extracting
#[arg(long)]
raw_html: bool,
/// CSS selectors to include (comma-separated, e.g. "article,.content")
#[arg(long)]
include: Option<String>,
/// CSS selectors to exclude (comma-separated, e.g. "nav,.sidebar,footer")
#[arg(long)]
exclude: Option<String>,
/// Only extract main content (article/main element)
#[arg(long)]
only_main_content: bool,
/// Custom headers (repeatable, e.g. -H "Cookie: foo=bar")
#[arg(short = 'H', long = "header")]
headers: Vec<String>,
/// Cookie string (shorthand for -H "Cookie: ...")
#[arg(long)]
cookie: Option<String>,
/// Enable verbose logging
#[arg(short, long)]
verbose: bool,
/// Compare against a previous JSON snapshot
#[arg(long)]
diff_with: Option<String>,
/// Extract brand identity (colors, fonts, logo)
#[arg(long)]
brand: bool,
// -- PDF options --
/// PDF extraction mode: auto (error on empty) or fast (return whatever text is found)
#[arg(long, default_value = "auto")]
pdf_mode: PdfModeArg,
// -- Crawl options --
/// Enable recursive crawling of same-domain links
#[arg(long)]
crawl: bool,
/// Max crawl depth [default: 1]
#[arg(long, default_value = "1")]
depth: usize,
/// Max pages to crawl [default: 20]
#[arg(long, default_value = "20")]
max_pages: usize,
/// Max concurrent requests [default: 5]
#[arg(long, default_value = "5")]
concurrency: usize,
/// Delay between requests in ms [default: 100]
#[arg(long, default_value = "100")]
delay: u64,
/// Only crawl URLs matching this path prefix
#[arg(long)]
path_prefix: Option<String>,
/// Glob patterns for crawl URL paths to include (comma-separated, e.g. "/api/*,/guides/**")
#[arg(long)]
include_paths: Option<String>,
/// Glob patterns for crawl URL paths to exclude (comma-separated, e.g. "/changelog/*,/blog/*")
#[arg(long)]
exclude_paths: Option<String>,
/// Seed crawl frontier from sitemap discovery (robots.txt + /sitemap.xml)
#[arg(long)]
sitemap: bool,
/// Discover URLs from sitemap and print them (one per line; JSON array with --format json)
#[arg(long)]
map: bool,
// -- LLM options --
/// Extract structured JSON using LLM (pass a JSON schema string or @file)
#[arg(long)]
extract_json: Option<String>,
/// Extract using natural language prompt
#[arg(long)]
extract_prompt: Option<String>,
/// Summarize content using LLM (optional: number of sentences, default 3)
#[arg(long, num_args = 0..=1, default_missing_value = "3")]
summarize: Option<usize>,
/// Force a specific LLM provider (ollama, openai, anthropic)
#[arg(long, env = "WEBCLAW_LLM_PROVIDER")]
llm_provider: Option<String>,
/// Override the LLM model name
#[arg(long, env = "WEBCLAW_LLM_MODEL")]
llm_model: Option<String>,
/// Override the LLM base URL (Ollama or OpenAI-compatible)
#[arg(long, env = "WEBCLAW_LLM_BASE_URL")]
llm_base_url: Option<String>,
// -- Cloud API options --
/// Webclaw Cloud API key for automatic fallback on bot-protected or JS-rendered sites
#[arg(long, env = "WEBCLAW_API_KEY")]
api_key: Option<String>,
/// Force all requests through the cloud API (skip local extraction)
#[arg(long)]
cloud: bool,
}
#[derive(Clone, ValueEnum)]
enum OutputFormat {
Markdown,
Json,
Text,
Llm,
}
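impl OutputFormat {
    /// Wire name sent to the cloud API. Shared by the two cloud-request paths
    /// in `fetch_and_extract` so the mapping lives in one place.
    fn as_str(&self) -> &'static str {
        match self {
            OutputFormat::Markdown => "markdown",
            OutputFormat::Json => "json",
            OutputFormat::Text => "text",
            OutputFormat::Llm => "llm",
        }
    }
}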
#[derive(Clone, ValueEnum)]
enum Browser {
Chrome,
Firefox,
Random,
}
#[derive(Clone, ValueEnum, Default)]
enum PdfModeArg {
/// Error if PDF has no extractable text (catches scanned PDFs)
#[default]
Auto,
/// Return whatever text is found, even if empty
Fast,
}
impl From<PdfModeArg> for PdfMode {
fn from(arg: PdfModeArg) -> Self {
match arg {
PdfModeArg::Auto => PdfMode::Auto,
PdfModeArg::Fast => PdfMode::Fast,
}
}
}
impl From<Browser> for BrowserProfile {
fn from(b: Browser) -> Self {
match b {
Browser::Chrome => BrowserProfile::Chrome,
Browser::Firefox => BrowserProfile::Firefox,
Browser::Random => BrowserProfile::Random,
}
}
}
fn init_logging(verbose: bool) {
let filter = if verbose {
EnvFilter::new("webclaw=debug")
} else {
EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new("warn"))
};
tracing_subscriber::fmt().with_env_filter(filter).init();
}
/// Build FetchConfig from CLI flags.
///
/// `--proxy` sets a single static proxy (no rotation).
/// `--proxy-file` loads a pool of proxies and rotates per-request.
/// `--proxy` takes priority: if both are set, only the single proxy is used.
fn build_fetch_config(cli: &Cli) -> FetchConfig {
let (proxy, proxy_pool) = if cli.proxy.is_some() {
(cli.proxy.clone(), Vec::new())
} else if let Some(ref path) = cli.proxy_file {
match webclaw_fetch::parse_proxy_file(path) {
Ok(pool) => (None, pool),
Err(e) => {
eprintln!("warning: {e}");
(None, Vec::new())
}
}
} else if std::path::Path::new("proxies.txt").exists() {
// Auto-load proxies.txt from working directory if present
match webclaw_fetch::parse_proxy_file("proxies.txt") {
Ok(pool) if !pool.is_empty() => {
eprintln!("loaded {} proxies from proxies.txt", pool.len());
(None, pool)
}
_ => (None, Vec::new()),
}
} else {
(None, Vec::new())
};
let mut headers = std::collections::HashMap::from([(
"Accept-Language".to_string(),
"en-US,en;q=0.9".to_string(),
)]);
// Parse -H "Key: Value" flags
for h in &cli.headers {
if let Some((key, val)) = h.split_once(':') {
headers.insert(key.trim().to_string(), val.trim().to_string());
}
}
// --cookie shorthand
if let Some(ref cookie) = cli.cookie {
headers.insert("Cookie".to_string(), cookie.clone());
}
FetchConfig {
browser: cli.browser.clone().into(),
proxy,
proxy_pool,
timeout: std::time::Duration::from_secs(cli.timeout),
pdf_mode: cli.pdf_mode.clone().into(),
headers,
..Default::default()
}
}
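// Sketch of the flag plumbing above, exercised through clap. `--proxy` is set
// explicitly so the test never reaches the proxies.txt auto-load, keeping it
// independent of the working directory.
#[cfg(test)]
mod fetch_config_tests {
    use super::*;

    #[test]
    fn headers_and_cookie_shorthand_land_in_the_config() {
        let cli = Cli::parse_from([
            "webclaw",
            "--proxy",
            "http://127.0.0.1:8080",
            "-H",
            "X-Test: a: b",
            "--cookie",
            "session=abc",
            "example.com",
        ]);
        let config = build_fetch_config(&cli);
        assert_eq!(config.proxy.as_deref(), Some("http://127.0.0.1:8080"));
        // split_once(':') splits on the first colon only, so colons in the value survive
        assert_eq!(config.headers.get("X-Test").map(String::as_str), Some("a: b"));
        assert_eq!(config.headers.get("Cookie").map(String::as_str), Some("session=abc"));
    }
}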
fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
ExtractionOptions {
include_selectors: cli
.include
.as_deref()
.map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
.unwrap_or_default(),
exclude_selectors: cli
.exclude
.as_deref()
.map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
.unwrap_or_default(),
only_main_content: cli.only_main_content,
include_raw_html: cli.raw_html,
}
}
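// Quick check of the comma-splitting rule above: selectors are trimmed, and an
// absent flag yields an empty vector.
#[cfg(test)]
mod extraction_options_tests {
    use super::*;

    #[test]
    fn selectors_split_on_commas_and_trim() {
        let cli = Cli::parse_from(["webclaw", "--include", "article, .content", "example.com"]);
        let opts = build_extraction_options(&cli);
        assert_eq!(opts.include_selectors, vec!["article", ".content"]);
        assert!(opts.exclude_selectors.is_empty());
    }
}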
/// Normalize a URL: prepend `https://` if no scheme is present.
fn normalize_url(url: &str) -> String {
let trimmed = url.trim();
if trimmed.contains("://") {
trimmed.to_string()
} else {
format!("https://{trimmed}")
}
}
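// Behavior sketch for normalize_url: anything already carrying a scheme passes
// through untouched; bare domains gain https://.
#[cfg(test)]
mod normalize_url_tests {
    use super::normalize_url;

    #[test]
    fn bare_domains_gain_https() {
        assert_eq!(normalize_url("example.com/a"), "https://example.com/a");
        assert_eq!(normalize_url(" http://example.com "), "http://example.com");
    }
}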
/// Collect all URLs from positional args + --urls-file, normalizing bare domains.
fn collect_urls(cli: &Cli) -> Result<Vec<String>, String> {
let mut urls: Vec<String> = cli.urls.iter().map(|u| normalize_url(u)).collect();
if let Some(ref path) = cli.urls_file {
let content =
std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
for line in content.lines() {
let trimmed = line.trim();
if !trimmed.is_empty() && !trimmed.starts_with('#') {
urls.push(normalize_url(trimmed));
}
}
}
Ok(urls)
}
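// End-to-end check of URL collection from positional args. The --urls-file
// branch needs a fixture on disk, so only the in-memory path is exercised here.
#[cfg(test)]
mod collect_urls_tests {
    use super::*;

    #[test]
    fn positional_urls_are_normalized() {
        let cli = Cli::parse_from(["webclaw", "example.com", "https://already.ok"]);
        let urls = collect_urls(&cli).unwrap();
        assert_eq!(urls, vec!["https://example.com", "https://already.ok"]);
    }
}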
/// Result that can be either a local extraction or a cloud API JSON response.
enum FetchOutput {
Local(Box<ExtractionResult>),
Cloud(serde_json::Value),
}
impl FetchOutput {
/// Get the local ExtractionResult, or try to parse it from the cloud response.
fn into_extraction(self) -> Result<ExtractionResult, String> {
match self {
FetchOutput::Local(r) => Ok(*r),
FetchOutput::Cloud(resp) => {
// Cloud response has an "extraction" field with the full ExtractionResult
resp.get("extraction")
.and_then(|v| serde_json::from_value(v.clone()).ok())
.or_else(|| serde_json::from_value(resp.clone()).ok())
.ok_or_else(|| "could not parse extraction from cloud response".to_string())
}
}
}
}
/// Fetch a URL and extract content, handling PDF detection automatically.
/// Falls back to cloud API when bot protection or JS rendering is detected.
async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
// Local sources: read and extract as HTML
if cli.stdin {
let mut buf = String::new();
io::stdin()
.read_to_string(&mut buf)
.map_err(|e| format!("failed to read stdin: {e}"))?;
let options = build_extraction_options(cli);
return extract_with_options(&buf, None, &options)
.map(|r| FetchOutput::Local(Box::new(r)))
.map_err(|e| format!("extraction error: {e}"));
}
if let Some(ref path) = cli.file {
let html =
std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
let options = build_extraction_options(cli);
return extract_with_options(&html, None, &options)
.map(|r| FetchOutput::Local(Box::new(r)))
.map_err(|e| format!("extraction error: {e}"));
}
let raw_url = cli
.urls
.first()
.ok_or("no input provided -- pass a URL, --file, or --stdin")?;
let url = normalize_url(raw_url);
let url = url.as_str();
let cloud_client = cloud::CloudClient::new(cli.api_key.as_deref());
// --cloud: skip local, go straight to cloud API
if cli.cloud {
let c =
cloud_client.ok_or("--cloud requires WEBCLAW_API_KEY (set via env or --api-key)")?;
let options = build_extraction_options(cli);
        let format_str = cli.format.as_str();
let resp = c
.scrape(
url,
&[format_str],
&options.include_selectors,
&options.exclude_selectors,
options.only_main_content,
)
.await?;
return Ok(FetchOutput::Cloud(resp));
}
// Normal path: try local first
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(cli);
let result = client
.fetch_and_extract_with_options(url, &options)
.await
.map_err(|e| format!("fetch error: {e}"))?;
// Check if we should fall back to cloud
let reason = detect_empty(&result);
if !matches!(reason, EmptyReason::None) {
if let Some(ref c) = cloud_client {
eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API...");
            let format_str = cli.format.as_str();
match c
.scrape(
url,
&[format_str],
&options.include_selectors,
&options.exclude_selectors,
options.only_main_content,
)
.await
{
Ok(resp) => return Ok(FetchOutput::Cloud(resp)),
Err(e) => {
eprintln!("\x1b[33mwarning:\x1b[0m cloud fallback failed: {e}");
// Fall through to return the local result with a warning
}
}
}
warn_empty(url, &reason);
}
Ok(FetchOutput::Local(Box::new(result)))
}
/// Fetch raw HTML from a URL (no extraction). Used for --raw-html and brand extraction.
async fn fetch_html(cli: &Cli) -> Result<FetchResult, String> {
if cli.stdin {
let mut buf = String::new();
io::stdin()
.read_to_string(&mut buf)
.map_err(|e| format!("failed to read stdin: {e}"))?;
return Ok(FetchResult {
html: buf,
url: String::new(),
status: 200,
headers: Default::default(),
elapsed: Default::default(),
});
}
if let Some(ref path) = cli.file {
let html =
std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
return Ok(FetchResult {
html,
url: String::new(),
status: 200,
headers: Default::default(),
elapsed: Default::default(),
});
}
let raw_url = cli
.urls
.first()
.ok_or("no input provided -- pass a URL, --file, or --stdin")?;
let url = normalize_url(raw_url);
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
client
.fetch(&url)
.await
.map_err(|e| format!("fetch error: {e}"))
}
/// Fetch external stylesheets referenced in HTML and inject them as `<style>` blocks.
/// This allows brand extraction to see colors/fonts from external CSS files.
async fn enrich_html_with_stylesheets(html: &str, base_url: &str) -> String {
let base = match url::Url::parse(base_url) {
Ok(u) => u,
Err(_) => return html.to_string(),
};
    // Extract stylesheet hrefs from <link rel="stylesheet" href="..."> in either
    // attribute order. The pattern is a valid literal, so unwrap cannot fail.
    let re = regex::Regex::new(
        r#"<link[^>]+rel=["']stylesheet["'][^>]+href=["']([^"']+)["']|<link[^>]+href=["']([^"']+)["'][^>]+rel=["']stylesheet["']"#
    ).unwrap();
let hrefs: Vec<String> = re
.captures_iter(html)
.filter_map(|cap| {
let href = cap.get(1).or(cap.get(2))?;
Some(
base.join(href.as_str())
.map(|u| u.to_string())
.unwrap_or_else(|_| href.as_str().to_string()),
)
})
.take(10)
.collect();
if hrefs.is_empty() {
return html.to_string();
}
let client = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(5))
.build()
.unwrap_or_default();
let mut extra_css = String::new();
for href in &hrefs {
if let Ok(resp) = client.get(href).send().await
&& resp.status().is_success()
&& let Ok(body) = resp.text().await
&& !body.trim_start().starts_with("<!")
&& body.len() < 2_000_000
{
extra_css.push_str("\n<style>\n");
extra_css.push_str(&body);
extra_css.push_str("\n</style>\n");
}
}
if extra_css.is_empty() {
return html.to_string();
}
    // Find "</head>" case-insensitively on raw bytes: to_lowercase() can change
    // byte lengths for non-ASCII text, which would make the splice offset invalid.
    let head_pos = html
        .as_bytes()
        .windows("</head>".len())
        .position(|w| w.eq_ignore_ascii_case(b"</head>"));
    if let Some(pos) = head_pos {
let mut enriched = String::with_capacity(html.len() + extra_css.len());
enriched.push_str(&html[..pos]);
enriched.push_str(&extra_css);
enriched.push_str(&html[pos..]);
enriched
} else {
format!("{extra_css}{html}")
}
}
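/// Render metadata as a YAML frontmatter block. Fields are omitted when absent;
/// a typical page produces something like:
///
/// ```text
/// ---
/// title: "Example Page"
/// source: "https://example.com"
/// word_count: 123
/// ---
/// ```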
fn format_frontmatter(meta: &Metadata) -> String {
let mut lines = vec!["---".to_string()];
    if let Some(title) = &meta.title {
        // Escape embedded double quotes so the YAML stays parseable.
        lines.push(format!("title: \"{}\"", title.replace('"', "\\\"")));
    }
    if let Some(author) = &meta.author {
        lines.push(format!("author: \"{}\"", author.replace('"', "\\\"")));
    }
if let Some(date) = &meta.published_date {
lines.push(format!("date: \"{date}\""));
}
if let Some(url) = &meta.url {
lines.push(format!("source: \"{url}\""));
}
if meta.word_count > 0 {
lines.push(format!("word_count: {}", meta.word_count));
}
lines.push("---".to_string());
lines.push(String::new()); // blank line after frontmatter
lines.join("\n")
}
fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) {
match format {
OutputFormat::Markdown => {
if show_metadata {
print!("{}", format_frontmatter(&result.metadata));
}
println!("{}", result.content.markdown);
}
OutputFormat::Json => {
// serde_json::to_string_pretty won't fail on our types
println!(
"{}",
serde_json::to_string_pretty(result).expect("serialization failed")
);
}
OutputFormat::Text => {
println!("{}", result.content.plain_text);
}
OutputFormat::Llm => {
println!("{}", to_llm_text(result, result.metadata.url.as_deref()));
}
}
}
/// Print cloud API response in the requested format.
fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
match format {
OutputFormat::Json => {
println!(
"{}",
serde_json::to_string_pretty(resp).expect("serialization failed")
);
}
OutputFormat::Markdown => {
// Cloud response has content.markdown
if let Some(md) = resp
.get("content")
.and_then(|c| c.get("markdown"))
.and_then(|m| m.as_str())
{
println!("{md}");
} else if let Some(md) = resp.get("markdown").and_then(|m| m.as_str()) {
println!("{md}");
} else {
println!(
"{}",
serde_json::to_string_pretty(resp).expect("serialization failed")
);
}
}
OutputFormat::Text => {
if let Some(txt) = resp
.get("content")
.and_then(|c| c.get("plain_text"))
.and_then(|t| t.as_str())
{
println!("{txt}");
} else {
// Fallback to markdown or raw JSON
print_cloud_output(resp, &OutputFormat::Markdown);
}
}
OutputFormat::Llm => {
if let Some(llm) = resp
.get("content")
.and_then(|c| c.get("llm_text"))
.and_then(|t| t.as_str())
{
println!("{llm}");
} else {
print_cloud_output(resp, &OutputFormat::Markdown);
}
}
}
}
fn print_diff_output(diff: &ContentDiff, format: &OutputFormat) {
match format {
OutputFormat::Json => {
println!(
"{}",
serde_json::to_string_pretty(diff).expect("serialization failed")
);
}
// For markdown/text/llm, show a human-readable summary
_ => {
println!("Status: {:?}", diff.status);
println!("Word count delta: {:+}", diff.word_count_delta);
if !diff.metadata_changes.is_empty() {
println!("\nMetadata changes:");
for change in &diff.metadata_changes {
println!(
" {}: {} -> {}",
change.field,
change.old.as_deref().unwrap_or("(none)"),
change.new.as_deref().unwrap_or("(none)"),
);
}
}
if !diff.links_added.is_empty() {
println!("\nLinks added:");
for link in &diff.links_added {
println!(" + {} ({})", link.href, link.text);
}
}
if !diff.links_removed.is_empty() {
println!("\nLinks removed:");
for link in &diff.links_removed {
println!(" - {} ({})", link.href, link.text);
}
}
if let Some(ref text_diff) = diff.text_diff {
println!("\n{text_diff}");
}
}
}
}
fn print_crawl_output(result: &CrawlResult, format: &OutputFormat, show_metadata: bool) {
match format {
OutputFormat::Json => {
println!(
"{}",
serde_json::to_string_pretty(result).expect("serialization failed")
);
}
OutputFormat::Markdown => {
for page in &result.pages {
let Some(ref extraction) = page.extraction else {
continue;
};
println!("---");
println!("# Page: {}\n", page.url);
if show_metadata {
print!("{}", format_frontmatter(&extraction.metadata));
}
println!("{}", extraction.content.markdown);
println!();
}
}
OutputFormat::Text => {
for page in &result.pages {
let Some(ref extraction) = page.extraction else {
continue;
};
println!("---");
println!("# Page: {}\n", page.url);
println!("{}", extraction.content.plain_text);
println!();
}
}
OutputFormat::Llm => {
for page in &result.pages {
let Some(ref extraction) = page.extraction else {
continue;
};
println!("---");
println!("{}", to_llm_text(extraction, Some(page.url.as_str())));
println!();
}
}
}
}
fn print_batch_output(results: &[BatchExtractResult], format: &OutputFormat, show_metadata: bool) {
match format {
OutputFormat::Json => {
// Build a JSON array of {url, result?, error?} objects
let entries: Vec<serde_json::Value> = results
.iter()
.map(|r| match &r.result {
Ok(extraction) => serde_json::json!({
"url": r.url,
"result": extraction,
}),
Err(e) => serde_json::json!({
"url": r.url,
"error": e.to_string(),
}),
})
.collect();
println!(
"{}",
serde_json::to_string_pretty(&entries).expect("serialization failed")
);
}
OutputFormat::Markdown => {
for r in results {
match &r.result {
Ok(extraction) => {
println!("---");
println!("# {}\n", r.url);
if show_metadata {
print!("{}", format_frontmatter(&extraction.metadata));
}
println!("{}", extraction.content.markdown);
println!();
}
Err(e) => {
eprintln!("error: {} -- {}", r.url, e);
}
}
}
}
OutputFormat::Text => {
for r in results {
match &r.result {
Ok(extraction) => {
println!("---");
println!("# {}\n", r.url);
println!("{}", extraction.content.plain_text);
println!();
}
Err(e) => {
eprintln!("error: {} -- {}", r.url, e);
}
}
}
}
OutputFormat::Llm => {
for r in results {
match &r.result {
Ok(extraction) => {
println!("---");
println!("{}", to_llm_text(extraction, Some(r.url.as_str())));
println!();
}
Err(e) => {
eprintln!("error: {} -- {}", r.url, e);
}
}
}
}
}
}
fn print_map_output(entries: &[SitemapEntry], format: &OutputFormat) {
match format {
OutputFormat::Json => {
println!(
"{}",
serde_json::to_string_pretty(entries).expect("serialization failed")
);
}
_ => {
for entry in entries {
println!("{}", entry.url);
}
}
}
}
async fn run_crawl(cli: &Cli) -> Result<(), String> {
let url = cli
.urls
.first()
.ok_or("--crawl requires a URL argument")
.map(|u| normalize_url(u))?;
let url = url.as_str();
if cli.file.is_some() || cli.stdin {
return Err("--crawl cannot be used with --file or --stdin".into());
}
let include_patterns: Vec<String> = cli
.include_paths
.as_deref()
.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
.unwrap_or_default();
let exclude_patterns: Vec<String> = cli
.exclude_paths
.as_deref()
.map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
.unwrap_or_default();
let config = CrawlConfig {
fetch: build_fetch_config(cli),
max_depth: cli.depth,
max_pages: cli.max_pages,
concurrency: cli.concurrency,
delay: std::time::Duration::from_millis(cli.delay),
path_prefix: cli.path_prefix.clone(),
use_sitemap: cli.sitemap,
include_patterns,
exclude_patterns,
progress_tx: None,
};
let crawler = Crawler::new(url, config).map_err(|e| format!("crawler error: {e}"))?;
let result = crawler.crawl(url).await;
// Log per-page errors and extraction warnings to stderr
for page in &result.pages {
if let Some(ref err) = page.error {
eprintln!("error: {} -- {}", page.url, err);
} else if let Some(ref extraction) = page.extraction {
let reason = detect_empty(extraction);
if !matches!(reason, EmptyReason::None) {
warn_empty(&page.url, &reason);
}
}
}
print_crawl_output(&result, &cli.format, cli.metadata);
eprintln!(
"Crawled {} pages ({} ok, {} errors) in {:.1}s",
result.total, result.ok, result.errors, result.elapsed_secs,
);
if result.errors > 0 {
Err(format!(
"{} of {} pages failed",
result.errors, result.total
))
} else {
Ok(())
}
}
async fn run_map(cli: &Cli) -> Result<(), String> {
let url = cli
.urls
.first()
.ok_or("--map requires a URL argument")
.map(|u| normalize_url(u))?;
let url = url.as_str();
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let entries = webclaw_fetch::sitemap::discover(&client, url)
.await
.map_err(|e| format!("sitemap discovery failed: {e}"))?;
if entries.is_empty() {
eprintln!("no sitemap URLs found for {url}");
} else {
eprintln!("discovered {} URLs", entries.len());
}
print_map_output(&entries, &cli.format);
Ok(())
}
async fn run_batch(cli: &Cli, urls: &[String]) -> Result<(), String> {
let client = Arc::new(
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
);
let url_refs: Vec<&str> = urls.iter().map(String::as_str).collect();
let results = client
.fetch_and_extract_batch(&url_refs, cli.concurrency)
.await;
let ok = results.iter().filter(|r| r.result.is_ok()).count();
let errors = results.len() - ok;
// Log errors and extraction warnings to stderr
    for r in &results {
        match &r.result {
            Err(e) => eprintln!("error: {} -- {}", r.url, e),
            Ok(extraction) => {
                let reason = detect_empty(extraction);
                if !matches!(reason, EmptyReason::None) {
                    warn_empty(&r.url, &reason);
                }
            }
        }
    }
print_batch_output(&results, &cli.format, cli.metadata);
eprintln!(
"Fetched {} URLs ({} ok, {} errors)",
results.len(),
ok,
errors
);
if errors > 0 {
Err(format!("{errors} of {} URLs failed", results.len()))
} else {
Ok(())
}
}
async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
// Load previous snapshot
let snapshot_json = std::fs::read_to_string(snapshot_path)
.map_err(|e| format!("failed to read snapshot {snapshot_path}: {e}"))?;
let old: ExtractionResult = serde_json::from_str(&snapshot_json)
.map_err(|e| format!("failed to parse snapshot JSON: {e}"))?;
// Extract current version (handles PDF detection for URLs)
let new_result = fetch_and_extract(cli).await?.into_extraction()?;
let diff = webclaw_core::diff::diff(&old, &new_result);
print_diff_output(&diff, &cli.format);
Ok(())
}
async fn run_brand(cli: &Cli) -> Result<(), String> {
let result = fetch_html(cli).await?;
let enriched = enrich_html_with_stylesheets(&result.html, &result.url).await;
let brand = webclaw_core::brand::extract_brand(
&enriched,
Some(result.url.as_str()).filter(|s| !s.is_empty()),
);
println!(
"{}",
serde_json::to_string_pretty(&brand).expect("serialization failed")
);
Ok(())
}
/// Build an LLM provider based on CLI flags, or fall back to the default chain.
async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
if let Some(ref name) = cli.llm_provider {
match name.as_str() {
"ollama" => {
let provider = webclaw_llm::providers::ollama::OllamaProvider::new(
cli.llm_base_url.clone(),
cli.llm_model.clone(),
);
if !provider.is_available().await {
return Err("ollama is not running or unreachable".into());
}
Ok(Box::new(provider))
}
"openai" => {
let provider = webclaw_llm::providers::openai::OpenAiProvider::new(
None,
cli.llm_base_url.clone(),
cli.llm_model.clone(),
)
.ok_or("OPENAI_API_KEY not set")?;
Ok(Box::new(provider))
}
"anthropic" => {
let provider = webclaw_llm::providers::anthropic::AnthropicProvider::new(
None,
cli.llm_model.clone(),
)
.ok_or("ANTHROPIC_API_KEY not set")?;
Ok(Box::new(provider))
}
other => Err(format!(
"unknown LLM provider: {other} (use ollama, openai, or anthropic)"
)),
}
} else {
let chain = webclaw_llm::ProviderChain::default().await;
if chain.is_empty() {
return Err(
"no LLM providers available -- start Ollama or set OPENAI_API_KEY / ANTHROPIC_API_KEY"
.into(),
);
}
Ok(Box::new(chain))
}
}
async fn run_llm(cli: &Cli) -> Result<(), String> {
// Extract content from source first (handles PDF detection for URLs)
let result = fetch_and_extract(cli).await?.into_extraction()?;
let provider = build_llm_provider(cli).await?;
let model = cli.llm_model.as_deref();
if let Some(ref schema_input) = cli.extract_json {
// Support @file syntax for loading schema from file
let schema_str = if let Some(path) = schema_input.strip_prefix('@') {
std::fs::read_to_string(path)
.map_err(|e| format!("failed to read schema file {path}: {e}"))?
} else {
schema_input.clone()
};
let schema: serde_json::Value =
serde_json::from_str(&schema_str).map_err(|e| format!("invalid JSON schema: {e}"))?;
let extracted = webclaw_llm::extract::extract_json(
&result.content.plain_text,
&schema,
provider.as_ref(),
model,
)
.await
.map_err(|e| format!("LLM extraction failed: {e}"))?;
println!(
"{}",
serde_json::to_string_pretty(&extracted).expect("serialization failed")
);
} else if let Some(ref prompt) = cli.extract_prompt {
let extracted = webclaw_llm::extract::extract_with_prompt(
&result.content.plain_text,
prompt,
provider.as_ref(),
model,
)
.await
.map_err(|e| format!("LLM extraction failed: {e}"))?;
println!(
"{}",
serde_json::to_string_pretty(&extracted).expect("serialization failed")
);
} else if let Some(sentences) = cli.summarize {
let summary = webclaw_llm::summarize::summarize(
&result.content.plain_text,
Some(sentences),
provider.as_ref(),
model,
)
.await
.map_err(|e| format!("LLM summarization failed: {e}"))?;
println!("{summary}");
}
Ok(())
}
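// Tiny illustration of the @file convention handled above: a leading '@' means
// "read the JSON schema from this path", anything else is treated as inline JSON.
#[cfg(test)]
mod schema_arg_tests {
    #[test]
    fn at_prefix_marks_a_file_path() {
        assert_eq!("@schema.json".strip_prefix('@'), Some("schema.json"));
        assert_eq!(r#"{"type":"object"}"#.strip_prefix('@'), None);
    }
}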
/// Returns true if any LLM flag is set.
fn has_llm_flags(cli: &Cli) -> bool {
cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some()
}
#[tokio::main]
async fn main() {
dotenvy::dotenv().ok();
let cli = Cli::parse();
init_logging(cli.verbose);
// --map: sitemap discovery mode
if cli.map {
if let Err(e) = run_map(&cli).await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
// --crawl: recursive crawl mode
if cli.crawl {
if let Err(e) = run_crawl(&cli).await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
// --diff-with: change tracking mode
if let Some(ref snapshot_path) = cli.diff_with {
if let Err(e) = run_diff(&cli, snapshot_path).await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
// --brand: brand identity extraction mode
if cli.brand {
if let Err(e) = run_brand(&cli).await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
// LLM modes: --extract-json, --extract-prompt, --summarize
if has_llm_flags(&cli) {
if let Err(e) = run_llm(&cli).await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
// Collect all URLs from args + --urls-file
let urls = match collect_urls(&cli) {
Ok(u) => u,
Err(e) => {
eprintln!("error: {e}");
process::exit(1);
}
};
// Multi-URL batch mode
if urls.len() > 1 {
if let Err(e) = run_batch(&cli, &urls).await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
    // --raw-html: skip extraction and dump the fetched HTML. When selectors are
    // also given, fall through to the extraction pipeline, which carries the raw
    // HTML via ExtractionOptions::include_raw_html instead.
    if cli.raw_html && cli.include.is_none() && cli.exclude.is_none() {
match fetch_html(&cli).await {
Ok(r) => println!("{}", r.html),
Err(e) => {
eprintln!("error: {e}");
process::exit(1);
}
}
return;
}
// Single-page extraction (handles both HTML and PDF via content-type detection)
match fetch_and_extract(&cli).await {
Ok(FetchOutput::Local(result)) => {
print_output(&result, &cli.format, cli.metadata);
}
Ok(FetchOutput::Cloud(resp)) => {
print_cloud_output(&resp, &cli.format);
}
Err(e) => {
eprintln!("{e}");
process::exit(1);
}
}
}