perf(core): hot-path extraction speedups + senior-grade hardening

Extraction ~22% faster on the corpus benchmark with byte-identical output:
- hoist recompiled CSS selectors in the markdown noise path
- single-pass shared og() meta parsing across vertical extractors
- output-safe QuickJS gating (skip the JS VM when no candidate data) +
  reuse the already-parsed document instead of re-parsing
- wreq connect_timeout + connection-pool tuning; dedup the retry loop

Reliability + correctness:
- char-boundary-safe truncation of LLM error bodies (shared helper)
- HTTP connect/read timeouts on all LLM provider clients
- isolate pdf-extract behind catch_unwind + spawn_blocking
- OSS server: crawl inherits the shared fetch profile; ProviderChain built
  once in AppState; request TimeoutLayer

API / safety / docs:
- #[non_exhaustive] on public enums + result structs (+ builders)
- #![forbid(unsafe_code)] on pure crates, deny on llm
- //! crate docs + doctests; scrub bypass/vendor/target specifics from
  public crate docs and comments

Tooling: [profile.release] lto/codegen-units/strip, MSRV pin, deny.toml +
cargo-deny CI, macOS test matrix. CLI main.rs split into focused modules.
This commit is contained in:
webclaw 2026-06-04 20:22:00 +02:00
parent e499e51e70
commit 02302e7a1d
62 changed files with 3761 additions and 3130 deletions

View file

@ -12,7 +12,11 @@ env:
jobs:
test:
name: Test
runs-on: ubuntu-latest
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest]
steps:
- uses: actions/checkout@v5
- uses: dtolnay/rust-toolchain@stable
@ -29,7 +33,16 @@ jobs:
components: clippy, rustfmt
- uses: Swatinem/rust-cache@v2
- run: cargo fmt --check --all
- run: cargo clippy --all -- -D warnings
- run: cargo clippy --all --all-targets -- -D warnings
deny:
name: Supply chain
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: EmbarkStudios/cargo-deny-action@v2
with:
command: check advisories bans licenses sources
wasm:
name: WASM

2
Cargo.lock generated
View file

@ -3331,11 +3331,13 @@ dependencies = [
"anyhow",
"axum",
"clap",
"http-body-util",
"serde",
"serde_json",
"subtle",
"thiserror",
"tokio",
"tower",
"tower-http",
"tracing",
"tracing-subscriber",

View file

@ -5,9 +5,31 @@ members = ["crates/*"]
[workspace.package]
version = "0.6.5"
edition = "2024"
rust-version = "1.85"
license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw"
# Hardened release profile: thin LTO + a single codegen unit enable
# cross-crate inlining on the hot extraction path and shrink the binaries,
# and stripping symbols trims the shipped artifact. We deliberately do NOT
# set `panic = "abort"`: webclaw-pdf relies on std::panic::catch_unwind to
# recover from panics inside the pdf-extract parser, and abort would turn
# those recoverable panics into hard process kills.
[profile.release]
lto = "thin"
codegen-units = 1
strip = true
# Conservative, high-value hardening lints, defined once for the whole
# workspace. They are not applied automatically: each crate opts in with
# `[lints] workspace = true`. The set is kept deliberately narrow so that
# `clippy -D warnings` stays green — the goal is hardening, not a cleanup
# sweep that would break the build.
[workspace.lints.rust]
unsafe_op_in_unsafe_fn = "warn"
[workspace.lints.clippy]
mem_forget = "warn"
[workspace.dependencies]
webclaw-core = { path = "crates/webclaw-core" }
webclaw-fetch = { path = "crates/webclaw-fetch" }

View file

@ -3,8 +3,12 @@ name = "webclaw-cli"
description = "CLI for extracting web content into LLM-optimized formats"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
[lints]
workspace = true
[[bin]]
name = "webclaw"
path = "src/main.rs"

View file

@ -48,7 +48,10 @@ async fn main() {
match mode.as_str() {
"capture" => capture().await,
"bench" => {
let iters: usize = std::env::args().nth(2).and_then(|s| s.parse().ok()).unwrap_or(60);
let iters: usize = std::env::args()
.nth(2)
.and_then(|s| s.parse().ok())
.unwrap_or(60);
bench(iters);
}
"snapshot" => {
@ -64,14 +67,21 @@ async fn main() {
async fn capture() {
fs::create_dir_all(CORPUS).unwrap();
let config = FetchConfig { browser: BrowserProfile::Chrome, ..FetchConfig::default() };
let config = FetchConfig {
browser: BrowserProfile::Chrome,
..FetchConfig::default()
};
let client = FetchClient::new(config).expect("build client");
let mut ok = 0;
for (i, u) in URLS.iter().enumerate() {
let name = format!(
"{:02}_{}.html",
i + 1,
u.replace("https://", "").chars().map(|c| if c.is_alphanumeric() { c } else { '_' }).take(40).collect::<String>()
u.replace("https://", "")
.chars()
.map(|c| if c.is_alphanumeric() { c } else { '_' })
.take(40)
.collect::<String>()
);
match client.fetch(u).await {
Ok(f) if f.html.len() > 1000 => {
@ -99,7 +109,9 @@ fn snapshot(label: &str) {
let mut n = 0;
for path in &files {
let html = fs::read_to_string(path).unwrap_or_default();
if html.is_empty() { continue; }
if html.is_empty() {
continue;
}
let stem = path.file_stem().unwrap().to_string_lossy().to_string();
let url = format!("https://corpus/{stem}");
match extract(&html, Some(&url)) {
@ -117,7 +129,9 @@ fn snapshot(label: &str) {
}
fn percentile(sorted: &[u128], p: f64) -> u128 {
if sorted.is_empty() { return 0; }
if sorted.is_empty() {
return 0;
}
let idx = ((sorted.len() as f64 - 1.0) * p).round() as usize;
sorted[idx]
}
@ -135,7 +149,10 @@ fn bench(iters: usize) {
}
println!("# perf_corpus bench docs={} iters={}", files.len(), iters);
println!("{:<42} {:>10} {:>10} {:>10} {:>10}", "doc(KB)", "extract_us", "llm_us", "p50_us", "p90_us");
println!(
"{:<42} {:>10} {:>10} {:>10} {:>10}",
"doc(KB)", "extract_us", "llm_us", "p50_us", "p90_us"
);
let mut grand_extract = 0u128;
let mut grand_llm = 0u128;
@ -143,8 +160,13 @@ fn bench(iters: usize) {
for path in &files {
let html = fs::read_to_string(path).unwrap_or_default();
if html.is_empty() { continue; }
let url = format!("https://corpus/{}", path.file_name().unwrap().to_string_lossy());
if html.is_empty() {
continue;
}
let url = format!(
"https://corpus/{}",
path.file_name().unwrap().to_string_lossy()
);
// warmup
for _ in 0..5 {
@ -158,7 +180,10 @@ fn bench(iters: usize) {
let mut total_times = Vec::with_capacity(iters);
for _ in 0..iters {
let t0 = Instant::now();
let ex = match extract(&html, Some(&url)) { Ok(e) => e, Err(_) => continue };
let ex = match extract(&html, Some(&url)) {
Ok(e) => e,
Err(_) => continue,
};
let t1 = Instant::now();
let txt = to_llm_text(&ex, Some(&url));
let t2 = Instant::now();
@ -178,11 +203,24 @@ fn bench(iters: usize) {
grand_llm += llm_p50;
grand_total_p50 += tot_p50;
let label = format!("{} ({}KB)", path.file_stem().unwrap().to_string_lossy(), html.len() / 1024);
println!("{:<42} {:>10} {:>10} {:>10} {:>10}", label.chars().take(42).collect::<String>(), ex_p50, llm_p50, tot_p50, tot_p90);
let label = format!(
"{} ({}KB)",
path.file_stem().unwrap().to_string_lossy(),
html.len() / 1024
);
println!(
"{:<42} {:>10} {:>10} {:>10} {:>10}",
label.chars().take(42).collect::<String>(),
ex_p50,
llm_p50,
tot_p50,
tot_p90
);
}
println!("---");
println!("CORPUS_PASS_P50_SUM_US extract={grand_extract} llm={grand_llm} total={grand_total_p50}");
println!(
"CORPUS_PASS_P50_SUM_US extract={grand_extract} llm={grand_llm} total={grand_total_p50}"
);
println!("(lower is better; total = one full extract+llm pass over the whole corpus at p50)");
}

View file

@ -198,7 +198,7 @@ fn fmt_int(n: usize) -> String {
let bytes = s.as_bytes();
let mut out = String::with_capacity(s.len() + s.len() / 3);
for (i, b) in bytes.iter().enumerate() {
if i > 0 && (bytes.len() - i).is_multiple_of(3) {
if i > 0 && (bytes.len() - i) % 3 == 0 {
out.push(',');
}
out.push(*b as char);

View file

@ -0,0 +1,324 @@
//! CLI argument definitions: clap structs/enums and their conversions.
use std::path::PathBuf;
use clap::{Parser, Subcommand, ValueEnum};
use webclaw_fetch::BrowserProfile;
use webclaw_pdf::PdfMode;
// Top-level argument set for the `webclaw` binary.
//
// NOTE: clap's derive macros render the `///` comments on the fields below as
// `--help` text, so they are user-facing strings. Maintainer notes therefore
// use plain `//` comments. clap also appends `[default: …]` automatically for
// any argument with a `default_value`, so the help text must not repeat it.
#[derive(Parser)]
#[command(name = "webclaw", about = "Extract web content for LLMs", version)]
pub struct Cli {
    /// Optional subcommand. When omitted, the CLI falls back to the
    /// traditional flag-based flow (URL + --format, --crawl, etc.).
    /// Subcommands are used for flows that don't fit that model.
    #[command(subcommand)]
    pub command: Option<Commands>,

    /// URLs to fetch (multiple allowed)
    #[arg()]
    pub urls: Vec<String>,

    /// File with URLs (one per line)
    #[arg(long)]
    pub urls_file: Option<String>,

    /// Output format (markdown, json, text, llm, html)
    #[arg(short, long, default_value = "markdown")]
    pub format: OutputFormat,

    /// Browser to impersonate
    #[arg(short, long, default_value = "chrome")]
    pub browser: Browser,

    /// Proxy URL (http://user:pass@host:port or socks5://host:port)
    #[arg(short, long, env = "WEBCLAW_PROXY")]
    pub proxy: Option<String>,

    /// File with proxies (host:port:user:pass, one per line). Rotates per request.
    #[arg(long, env = "WEBCLAW_PROXY_FILE")]
    pub proxy_file: Option<String>,

    /// Request timeout in seconds
    #[arg(short, long, default_value = "30")]
    pub timeout: u64,

    /// Extract from local HTML file instead of fetching
    #[arg(long)]
    pub file: Option<String>,

    /// Read HTML from stdin
    #[arg(long)]
    pub stdin: bool,

    /// Include metadata in output (always included in JSON)
    #[arg(long)]
    pub metadata: bool,

    /// Output raw fetched HTML instead of extracting
    #[arg(long)]
    pub raw_html: bool,

    /// CSS selectors to include (comma-separated, e.g. "article,.content")
    #[arg(long)]
    pub include: Option<String>,

    /// CSS selectors to exclude (comma-separated, e.g. "nav,.sidebar,footer")
    #[arg(long)]
    pub exclude: Option<String>,

    /// Only extract main content (article/main element)
    #[arg(long)]
    pub only_main_content: bool,

    /// Custom headers (repeatable, e.g. -H "Cookie: foo=bar")
    #[arg(short = 'H', long = "header")]
    pub headers: Vec<String>,

    /// Cookie string (shorthand for -H "Cookie: ...")
    #[arg(long)]
    pub cookie: Option<String>,

    /// JSON cookie file (Chrome extension format: [{name, value, domain, ...}])
    #[arg(long)]
    pub cookie_file: Option<String>,

    /// Enable verbose logging
    #[arg(short, long)]
    pub verbose: bool,

    /// Compare against a previous JSON snapshot
    #[arg(long)]
    pub diff_with: Option<String>,

    /// Watch a URL for changes. Checks at the specified interval and reports diffs.
    #[arg(long)]
    pub watch: bool,

    /// Watch interval in seconds
    #[arg(long, default_value = "300")]
    pub watch_interval: u64,

    /// Command to run when changes are detected (receives diff JSON on stdin)
    #[arg(long)]
    pub on_change: Option<String>,

    /// Webhook URL: POST a JSON payload when an operation completes.
    /// Works with crawl, batch, watch (on change), and single URL modes.
    #[arg(long, env = "WEBCLAW_WEBHOOK_URL")]
    pub webhook: Option<String>,

    /// Extract brand identity (colors, fonts, logo)
    #[arg(long)]
    pub brand: bool,

    // -- PDF options --
    /// PDF extraction mode: auto (error on empty) or fast (return whatever text is found)
    #[arg(long, default_value = "auto")]
    pub pdf_mode: PdfModeArg,

    // -- Crawl options --
    /// Enable recursive crawling of same-domain links
    #[arg(long)]
    pub crawl: bool,

    /// Max crawl depth
    #[arg(long, default_value = "1")]
    pub depth: usize,

    /// Max pages to crawl
    #[arg(long, default_value = "20")]
    pub max_pages: usize,

    /// Max concurrent requests
    #[arg(long, default_value = "5")]
    pub concurrency: usize,

    /// Delay between requests in ms
    #[arg(long, default_value = "100")]
    pub delay: u64,

    /// Only crawl URLs matching this path prefix
    #[arg(long)]
    pub path_prefix: Option<String>,

    /// Glob patterns for crawl URL paths to include (comma-separated, e.g. "/api/*,/guides/**")
    #[arg(long)]
    pub include_paths: Option<String>,

    /// Glob patterns for crawl URL paths to exclude (comma-separated, e.g. "/changelog/*,/blog/*")
    #[arg(long)]
    pub exclude_paths: Option<String>,

    /// Path to save/resume crawl state. On Ctrl+C: saves progress. On start: resumes if file exists.
    #[arg(long)]
    pub crawl_state: Option<PathBuf>,

    /// Seed crawl frontier from sitemap discovery (robots.txt + /sitemap.xml)
    #[arg(long)]
    pub sitemap: bool,

    /// Discover URLs from sitemap and print them (one per line; JSON array with --format json)
    #[arg(long)]
    pub map: bool,

    // -- LLM options --
    /// Extract structured JSON using LLM (pass a JSON schema string or @file)
    #[arg(long)]
    pub extract_json: Option<String>,

    /// Extract using natural language prompt
    #[arg(long)]
    pub extract_prompt: Option<String>,

    /// Summarize content using LLM (optional: number of sentences, default 3)
    // `num_args = 0..=1` lets `--summarize` appear with or without a value;
    // the bare flag takes `default_missing_value`.
    #[arg(long, num_args = 0..=1, default_missing_value = "3")]
    pub summarize: Option<usize>,

    /// Force a specific LLM provider (ollama, openai, anthropic)
    #[arg(long, env = "WEBCLAW_LLM_PROVIDER")]
    pub llm_provider: Option<String>,

    /// Override the LLM model name
    #[arg(long, env = "WEBCLAW_LLM_MODEL")]
    pub llm_model: Option<String>,

    /// Override the LLM base URL (Ollama, OpenAI-compatible, or Anthropic-compatible)
    #[arg(long, env = "WEBCLAW_LLM_BASE_URL")]
    pub llm_base_url: Option<String>,

    // -- Cloud API options --
    /// Webclaw Cloud API key for automatic fallback on bot-protected or JS-rendered sites
    #[arg(long, env = "WEBCLAW_API_KEY")]
    pub api_key: Option<String>,

    /// Force all requests through the cloud API (skip local extraction)
    #[arg(long)]
    pub cloud: bool,

    /// Run deep research on a topic via the cloud API. Requires --api-key.
    /// Saves full result (report + sources + findings) to a JSON file.
    #[arg(long)]
    pub research: Option<String>,

    /// Enable deep research mode (longer, more thorough report). Used with --research.
    #[arg(long)]
    pub deep: bool,

    /// Output directory: save each page to a separate file instead of stdout.
    /// Works with --crawl, batch (multiple URLs), and single URL mode.
    /// Filenames are derived from URL paths (e.g. /docs/api -> docs/api.md).
    #[arg(long)]
    pub output_dir: Option<PathBuf>,
}
// Subcommands for flows that do not fit the flag-based default mode.
// The `///` comments on variants and fields are surfaced by clap as help
// text, so treat them as user-facing strings when editing.
#[derive(Subcommand)]
pub enum Commands {
/// Per-URL extraction micro-benchmark: compares raw HTML vs. the
/// webclaw --format llm output on token count, bytes, and
/// extraction time. Uses an approximate tokenizer (see `--help`).
Bench {
/// URL to benchmark.
url: String,
/// Emit a single JSON line instead of the ASCII table.
/// Machine-readable shape stable across releases.
#[arg(long)]
json: bool,
/// Optional path to a facts.json (same schema as the repo's
/// benchmarks/facts.json) for a fidelity column.
#[arg(long)]
facts: Option<PathBuf>,
},
/// List all vertical extractors in the catalog.
///
/// Each entry has a stable `name` (usable with `webclaw vertical <name>`),
/// a human-friendly label, a one-line description, and the URL
/// patterns it claims. The same data is served by `/v1/extractors`
/// when running the REST API.
Extractors {
/// Emit JSON instead of a human-friendly table.
#[arg(long)]
json: bool,
},
/// Run a vertical extractor by name. Returns typed JSON with fields
/// specific to the target site (title, price, author, rating, etc.)
/// rather than generic markdown.
///
/// Use `webclaw extractors` to see the full list. Example:
/// `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/`.
Vertical {
/// Vertical name (e.g. `reddit`, `github_repo`, `trustpilot_reviews`).
name: String,
/// URL to extract.
url: String,
/// Emit compact JSON (single line). Default is pretty-printed.
#[arg(long)]
raw: bool,
},
}
// Values accepted by `--format`. Each variant has a matching wire name in
// `OutputFormat::as_api_str`; add new variants in both places.
#[derive(Clone, ValueEnum)]
pub enum OutputFormat {
Markdown,
Json,
Text,
Llm,
Html,
}
impl OutputFormat {
/// Map to the cloud API's `formats` string. Single source of truth for the
/// format names the REST API expects.
pub fn as_api_str(&self) -> &'static str {
match self {
OutputFormat::Markdown => "markdown",
OutputFormat::Json => "json",
OutputFormat::Text => "text",
OutputFormat::Llm => "llm",
OutputFormat::Html => "html",
}
}
}
// Values accepted by `--browser`. Converted into
// `webclaw_fetch::BrowserProfile` by the `From<Browser>` impl in this file.
#[derive(Clone, ValueEnum)]
pub enum Browser {
Chrome,
Firefox,
/// Safari iOS 26. Pair with a country-matched residential proxy for sites
/// that reject non-mobile profiles.
SafariIos,
Random,
}
// Values accepted by `--pdf-mode`. Converted into `webclaw_pdf::PdfMode` by
// the `From<PdfModeArg>` impl in this file. `Auto` is the `Default`.
#[derive(Clone, ValueEnum, Default)]
pub enum PdfModeArg {
/// Error if PDF has no extractable text (catches scanned PDFs)
#[default]
Auto,
/// Return whatever text is found, even if empty
Fast,
}
/// One-to-one mapping from the CLI flag value to the PDF crate's mode.
impl From<PdfModeArg> for PdfMode {
    fn from(arg: PdfModeArg) -> Self {
        match arg {
            PdfModeArg::Auto => Self::Auto,
            PdfModeArg::Fast => Self::Fast,
        }
    }
}
/// One-to-one mapping from the CLI flag value to the fetch crate's profile.
impl From<Browser> for BrowserProfile {
    fn from(b: Browser) -> Self {
        match b {
            Browser::Chrome => Self::Chrome,
            Browser::Firefox => Self::Firefox,
            Browser::SafariIos => Self::SafariIos,
            Browser::Random => Self::Random,
        }
    }
}

View file

@ -0,0 +1,823 @@
//! Input handling and fetching: config building, URL/cookie parsing, empty-page
//! detection, output-file writing, and the fetch+extract entry points (local,
//! remote, and cloud fallback).
use std::io::{self, Read as _};
use std::path::{Path, PathBuf};
use std::process;
use webclaw_core::{ExtractionOptions, ExtractionResult, extract_with_options};
use webclaw_fetch::{FetchClient, FetchConfig, FetchResult};
use crate::cli::Cli;
/// Known anti-bot challenge page titles (case-insensitive prefix match).
///
/// Entries must stay lowercase: `detect_empty` lowercases the page title and
/// then tests `starts_with` against each entry.
const ANTIBOT_TITLES: &[&str] = &[
"just a moment",
"attention required",
"access denied",
"checking your browser",
"please wait",
"one more step",
"verify you are human",
"bot verification",
"security check",
"ddos protection",
];
/// URL host/path fragments that indicate a GDPR/cookie consent redirect.
///
/// Matched as plain substrings of the ASCII-lowercased final URL by
/// `is_consent_wall`, so entries must be lowercase.
const CONSENT_URL_FRAGMENTS: &[&str] = &[
"://consent.",
"/consent?",
"/consent/",
"collectconsent",
"consentcheck",
"/cmp/",
"guce.advertising.com",
];
/// English consent-wall title prefixes. Many providers localize this page, so
/// this is a best-effort secondary signal. URL shape is the primary signal.
///
/// Entries must be lowercase: `is_consent_wall` lowercases the title before
/// the prefix test.
const CONSENT_TITLES: &[&str] = &[
"before you continue",
"your privacy choices",
"we value your privacy",
"we care about your privacy",
"cookie consent",
"consent required",
];
/// Why a page returned empty or near-empty content, as classified by
/// `detect_empty`.
#[derive(Debug, PartialEq, Eq)]
pub enum EmptyReason {
/// Anti-bot challenge page (Cloudflare, Akamai, etc.)
Antibot,
/// GDPR/cookie consent redirect.
ConsentWall,
/// JS-only SPA that returns an empty shell without a browser.
JsRequired,
/// Page has content, or none of the known empty-page patterns matched.
None,
}
pub fn detect_empty(result: &ExtractionResult) -> EmptyReason {
// Consent walls can have a tiny body, so check before the content
// short-circuit.
if is_consent_wall(result) {
return EmptyReason::ConsentWall;
}
// Has real content. Nothing to warn about.
if result.metadata.word_count > 50 || !result.content.markdown.is_empty() {
return EmptyReason::None;
}
// Check for known anti-bot challenge titles
if let Some(ref title) = result.metadata.title {
let lower = title.to_lowercase();
if ANTIBOT_TITLES.iter().any(|t| lower.starts_with(t)) {
return EmptyReason::Antibot;
}
}
// Empty content with no title or a generic SPA shell = JS-only site
if result.metadata.word_count == 0 && result.content.links.is_empty() {
return EmptyReason::JsRequired;
}
EmptyReason::None
}
/// A consent wall is identified by either:
/// 1. The final URL pointing at a known consent host/path, or
/// 2. A consent-wall title prefix with a very small body.
fn is_consent_wall(result: &ExtractionResult) -> bool {
if let Some(ref url) = result.metadata.url {
let lower = url.to_ascii_lowercase();
if CONSENT_URL_FRAGMENTS
.iter()
.any(|fragment| lower.contains(fragment))
{
return true;
}
}
if result.metadata.word_count <= 50
&& let Some(ref title) = result.metadata.title
{
let lower = title.to_lowercase();
if CONSENT_TITLES
.iter()
.any(|prefix| lower.starts_with(prefix))
{
return true;
}
}
false
}
/// Print a yellow `warning:` message to stderr explaining why `url` produced
/// no usable content. Prints nothing for `EmptyReason::None`.
pub fn warn_empty(url: &str, reason: &EmptyReason) {
    let message = match reason {
        EmptyReason::None => return,
        EmptyReason::Antibot => format!(
            "\x1b[33mwarning:\x1b[0m Anti-bot protection detected on {url}\n\
             This site requires CAPTCHA solving or browser rendering.\n\
             Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing"
        ),
        EmptyReason::ConsentWall => format!(
            "\x1b[33mwarning:\x1b[0m GDPR/cookie consent wall detected on {url}\n\
             The site redirected to a consent page and returned no usable content.\n\
             Try a different region via --proxy, or pass a pre-accepted consent cookie\n\
             via --cookie / --cookie-file."
        ),
        EmptyReason::JsRequired => format!(
            "\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\
             This site requires JavaScript rendering (SPA).\n\
             Use the webclaw Cloud API for JS rendering: https://webclaw.io/pricing"
        ),
    };
    eprintln!("{message}");
}
/// Build FetchConfig from CLI flags.
///
/// Proxy selection:
/// - `--proxy` sets a single static proxy (no rotation).
/// - `--proxy-file` loads a pool of proxies and rotates per-request.
/// - `--proxy` takes priority: if both are set, only the single proxy is used.
/// - With neither flag, a `proxies.txt` in the working directory is loaded
///   automatically when it exists and is non-empty.
///
/// Headers start from a default `Accept-Language`, then `-H` flags are
/// applied, then `--cookie` sets `Cookie`, then `--cookie-file` is appended
/// to whatever `Cookie` value is present at that point.
///
/// A malformed `-H` value (no `:` separator, or an empty name) is skipped
/// with a warning on stderr rather than silently dropped. An unreadable or
/// invalid `--cookie-file` prints an error and exits the process with
/// status 1.
pub fn build_fetch_config(cli: &Cli) -> FetchConfig {
    let (proxy, proxy_pool) = if cli.proxy.is_some() {
        (cli.proxy.clone(), Vec::new())
    } else if let Some(ref path) = cli.proxy_file {
        match webclaw_fetch::parse_proxy_file(path) {
            Ok(pool) => (None, pool),
            Err(e) => {
                eprintln!("warning: {e}");
                (None, Vec::new())
            }
        }
    } else if std::path::Path::new("proxies.txt").exists() {
        // Auto-load proxies.txt from working directory if present
        match webclaw_fetch::parse_proxy_file("proxies.txt") {
            Ok(pool) if !pool.is_empty() => {
                eprintln!("loaded {} proxies from proxies.txt", pool.len());
                (None, pool)
            }
            _ => (None, Vec::new()),
        }
    } else {
        (None, Vec::new())
    };

    let mut headers = std::collections::HashMap::from([(
        "Accept-Language".to_string(),
        "en-US,en;q=0.9".to_string(),
    )]);

    // Parse -H "Key: Value" flags
    for h in &cli.headers {
        match h.split_once(':') {
            Some((key, val)) if !key.trim().is_empty() => {
                headers.insert(key.trim().to_string(), val.trim().to_string());
            }
            // Tell the user the flag was ignored instead of failing later
            // with a confusing "header not sent" symptom.
            _ => eprintln!("warning: ignoring malformed header {h:?} (expected \"Name: value\")"),
        }
    }

    // --cookie shorthand
    if let Some(ref cookie) = cli.cookie {
        headers.insert("Cookie".to_string(), cookie.clone());
    }

    // --cookie-file: parse JSON array of {name, value, domain, ...}
    if let Some(ref path) = cli.cookie_file {
        match parse_cookie_file(path) {
            Ok(cookie_str) => {
                // Merge with an existing Cookie header if one is already set.
                let merged = match headers.get("Cookie") {
                    Some(existing) => format!("{existing}; {cookie_str}"),
                    None => cookie_str,
                };
                headers.insert("Cookie".to_string(), merged);
            }
            Err(e) => {
                eprintln!("error: failed to parse cookie file: {e}");
                process::exit(1);
            }
        }
    }

    FetchConfig {
        browser: cli.browser.clone().into(),
        proxy,
        proxy_pool,
        timeout: std::time::Duration::from_secs(cli.timeout),
        pdf_mode: cli.pdf_mode.clone().into(),
        headers,
        ..Default::default()
    }
}
/// Read a JSON cookie export and turn it into a `Cookie` header value.
///
/// The file must hold a JSON array (Chrome extension format:
/// `[{name, value, domain, path, secure, httpOnly, expirationDate, ...}]`).
/// Only `name` and `value` are used; entries where either is missing or not
/// a string are skipped.
///
/// Returns `name=value` pairs joined with `"; "`, or an error message when
/// the file cannot be read, is not valid JSON, or yields no usable cookie.
fn parse_cookie_file(path: &str) -> Result<String, String> {
    let raw = match std::fs::read_to_string(path) {
        Ok(text) => text,
        Err(e) => return Err(format!("cannot read {path}: {e}")),
    };
    let entries: Vec<serde_json::Value> = match serde_json::from_str(&raw) {
        Ok(parsed) => parsed,
        Err(e) => return Err(format!("invalid JSON: {e}")),
    };

    let mut pairs: Vec<String> = Vec::new();
    for entry in &entries {
        let name = entry.get("name").and_then(|v| v.as_str());
        let value = entry.get("value").and_then(|v| v.as_str());
        if let (Some(name), Some(value)) = (name, value) {
            pairs.push(format!("{name}={value}"));
        }
    }

    if pairs.is_empty() {
        Err("no cookies found in file".to_string())
    } else {
        Ok(pairs.join("; "))
    }
}
/// Build `ExtractionOptions` from the CLI flags.
///
/// `--include` / `--exclude` are comma-separated selector lists. Each entry
/// is trimmed, and blank entries (from a trailing comma, a doubled comma, or
/// an empty flag value) are dropped so no empty-string selector is passed on.
/// Raw HTML is kept when `--raw-html` is set or the output format is `html`.
pub fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
    // Split a comma-separated flag value into trimmed, non-empty selectors.
    let parse_list = |raw: Option<&str>| -> Vec<String> {
        raw.map(|list| {
            list.split(',')
                .map(str::trim)
                .filter(|sel| !sel.is_empty())
                .map(str::to_string)
                .collect()
        })
        .unwrap_or_default()
    };

    ExtractionOptions {
        include_selectors: parse_list(cli.include.as_deref()),
        exclude_selectors: parse_list(cli.exclude.as_deref()),
        only_main_content: cli.only_main_content,
        include_raw_html: cli.raw_html || matches!(cli.format, crate::cli::OutputFormat::Html),
    }
}
/// Normalize a URL: prepend `https://` if no scheme is present.
///
/// Surrounding whitespace is trimmed first. The input counts as already
/// having a scheme only when the text before its first `://` is a
/// syntactically valid scheme: an ASCII letter followed by ASCII letters,
/// digits, `+`, `-` or `.`. A bare domain whose query string embeds another
/// URL (`example.com/login?next=https://other`) therefore still gets the
/// `https://` prefix.
pub fn normalize_url(url: &str) -> String {
    let trimmed = url.trim();
    let has_scheme = trimmed.split_once("://").is_some_and(|(scheme, _)| {
        let mut chars = scheme.chars();
        chars.next().is_some_and(|c| c.is_ascii_alphabetic())
            && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
    });
    if has_scheme {
        trimmed.to_string()
    } else {
        format!("https://{trimmed}")
    }
}
/// Derive a filename from a URL for `--output-dir`.
///
/// Strips the scheme/host, maps the path to a filesystem path, and appends
/// an extension matching the output format. The result is always a relative
/// path with no `.` / `..` segments:
///
/// - empty, `.` and `..` path segments are dropped;
/// - a root URL becomes `<host>/index`, with dots in the host replaced by `_`;
/// - the query string is appended with its `/` flattened to `_`, so it
///   cannot add directory levels or bring `..` segments back;
/// - a URL that fails to parse uses the host placeholder `unknown` rather
///   than producing an absolute `/index` path.
pub fn url_to_filename(raw_url: &str, format: &crate::cli::OutputFormat) -> String {
    use crate::cli::OutputFormat;

    let ext = match format {
        OutputFormat::Markdown | OutputFormat::Llm => "md",
        OutputFormat::Json => "json",
        OutputFormat::Text => "txt",
        OutputFormat::Html => "html",
    };

    let (host, path, query) = match url::Url::parse(raw_url) {
        Ok(u) => (
            u.host_str().unwrap_or("unknown").to_string(),
            u.path().to_string(),
            u.query().map(String::from),
        ),
        // Same placeholder as the "no host" case above; an empty host would
        // make the root-URL stem below start with `/`.
        Err(_) => ("unknown".to_string(), String::new(), None),
    };

    // Drop empty / "." / ".." path segments so a URL path like
    // `/../../etc/passwd` can't climb out of the output directory.
    let mut stem: String = path
        .split('/')
        .filter(|seg| !seg.is_empty() && *seg != "." && *seg != "..")
        .collect::<Vec<_>>()
        .join("/");

    if stem.is_empty() {
        // Use hostname for root URLs to avoid collisions in batch mode
        let clean_host = host.strip_prefix("www.").unwrap_or(&host);
        stem = format!("{}/index", clean_host.replace('.', "_"));
    }

    // Append query params so /p?id=123 doesn't collide with /p?id=456.
    // '/' is flattened first: the sanitizer below keeps '/' and '.', so a
    // query like `x=/../../etc` would otherwise add traversal segments after
    // the path was already cleaned.
    if let Some(q) = query {
        stem.push('_');
        stem.extend(q.chars().map(|c| if c == '/' { '_' } else { c }));
    }

    // Sanitize: keep alphanumeric, dash, underscore, dot, slash
    let sanitized: String = stem
        .chars()
        .map(|c| {
            if c.is_alphanumeric() || matches!(c, '-' | '_' | '.' | '/') {
                c
            } else {
                '_'
            }
        })
        .collect();

    format!("{sanitized}.{ext}")
}
/// Validate a caller-supplied output name (from a `url,filename` CSV line)
/// so it cannot escape the output directory.
///
/// Components are inspected in order and the first offending one decides the
/// error: a `..` component, or a root / drive-prefix component (absolute
/// path). An empty name is rejected as well. On success the name is returned
/// unchanged as a relative `PathBuf`.
fn safe_relative_filename(filename: &str) -> Result<PathBuf, String> {
    use std::path::Component;

    let requested = Path::new(filename);

    let violation = requested.components().find_map(|part| match part {
        Component::ParentDir => Some(format!("refusing path with '..' component: {filename}")),
        Component::RootDir | Component::Prefix(_) => {
            Some(format!("refusing absolute output path: {filename}"))
        }
        Component::Normal(_) | Component::CurDir => None,
    });
    if let Some(message) = violation {
        return Err(message);
    }

    if requested.as_os_str().is_empty() {
        Err("empty output filename".to_string())
    } else {
        Ok(requested.to_path_buf())
    }
}
/// Write extraction output to a file inside `dir`, creating parent dirs as needed.
///
/// `filename` may originate from an attacker-controlled `--urls-file`
/// (`url,filename` CSV). It is validated for traversal, and the canonical
/// destination directory is asserted to stay under the canonical output
/// directory before any write.
///
/// On success a `Saved: <path> (<n> words)` line is printed to stderr.
///
/// # Errors
/// Returns a message when the name is rejected by `safe_relative_filename`,
/// when a directory cannot be created or resolved, when the resolved parent
/// falls outside `dir`, or when the write itself fails.
pub fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
// Lexical check first: rejects `..`, absolute paths and empty names.
let rel = safe_relative_filename(filename)?;
let dest = dir.join(&rel);
// The output directory is created before it is canonicalized below.
std::fs::create_dir_all(dir)
.map_err(|e| format!("failed to create directory {}: {e}", dir.display()))?;
let base = dir
.canonicalize()
.map_err(|e| format!("failed to resolve output dir {}: {e}", dir.display()))?;
if let Some(parent) = dest.parent() {
std::fs::create_dir_all(parent)
.map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?;
// Second check on resolved paths: the canonical parent must still sit
// under the canonical output directory.
let canon_parent = parent
.canonicalize()
.map_err(|e| format!("failed to resolve {}: {e}", parent.display()))?;
if !canon_parent.starts_with(&base) {
return Err(format!(
"refusing to write outside output dir: {}",
dest.display()
));
}
}
std::fs::write(&dest, content)
.map_err(|e| format!("failed to write {}: {e}", dest.display()))?;
// Word count is informational only (whitespace-separated tokens).
let word_count = content.split_whitespace().count();
eprintln!("Saved: {} ({word_count} words)", dest.display());
Ok(())
}
/// Gather every URL to process: positional arguments first, then the lines
/// of `--urls-file`, with bare domains normalized via `normalize_url`.
///
/// Each entry is `(url, optional_custom_filename)`. In the URLs file, blank
/// lines and lines starting with `#` are skipped. A line of the form
/// `url,filename` (split at the first comma) supplies a custom filename; an
/// empty name after the comma counts as no name. Lines without a comma get
/// `None`, so the caller derives the filename from the URL.
///
/// Fails only when `--urls-file` cannot be read.
pub fn collect_urls(cli: &Cli) -> Result<Vec<(String, Option<String>)>, String> {
    let mut entries: Vec<(String, Option<String>)> = Vec::with_capacity(cli.urls.len());
    for positional in &cli.urls {
        entries.push((normalize_url(positional), None));
    }

    let Some(path) = cli.urls_file.as_ref() else {
        return Ok(entries);
    };
    let content =
        std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;

    let meaningful_lines = content
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty() && !line.starts_with('#'));

    for line in meaningful_lines {
        let entry = match line.split_once(',') {
            Some((url_part, name_part)) => {
                let name = name_part.trim();
                let custom = (!name.is_empty()).then(|| name.to_string());
                (normalize_url(url_part.trim()), custom)
            }
            None => (normalize_url(line), None),
        };
        entries.push(entry);
    }

    Ok(entries)
}
/// Result that can be either a local extraction or a cloud API JSON response.
pub enum FetchOutput {
/// Extraction produced locally; boxed `ExtractionResult`.
Local(Box<ExtractionResult>),
/// Raw JSON body returned by the cloud API's scrape call.
Cloud(serde_json::Value),
}
impl FetchOutput {
    /// Consume the output and return an `ExtractionResult`.
    ///
    /// A local result is unboxed as-is. For a cloud response, the
    /// `"extraction"` field is deserialized first; if it is absent or does
    /// not deserialize, the whole response is tried instead. Fails when
    /// neither attempt works.
    pub fn into_extraction(self) -> Result<ExtractionResult, String> {
        let resp = match self {
            FetchOutput::Local(local) => return Ok(*local),
            FetchOutput::Cloud(resp) => resp,
        };

        let nested = resp
            .get("extraction")
            .and_then(|v| serde_json::from_value::<ExtractionResult>(v.clone()).ok());

        match nested {
            Some(extraction) => Ok(extraction),
            None => serde_json::from_value(resp)
                .map_err(|_| "could not parse extraction from cloud response".to_string()),
        }
    }
}
/// Fetch and extract content from the first available input.
///
/// Input precedence: `--stdin`, then `--file`, then the first positional URL.
/// Stdin and file input are extracted locally as HTML with no base URL and
/// never touch the network or the cloud API.
///
/// For a URL: with `--cloud` the request goes straight to the cloud API
/// (an API key is required). Otherwise the page is fetched and extracted
/// locally; if `detect_empty` reports any reason other than `None`
/// (anti-bot, consent wall, or JS-required) and a cloud client is
/// configured, the cloud API is tried as a fallback. If there is no cloud
/// client, or the fallback fails, a warning is printed and the local result
/// is returned anyway.
///
/// NOTE(review): PDF handling is not visible in this function; it presumably
/// happens inside `FetchClient::fetch_and_extract_with_options` — confirm.
pub async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
// Local sources: read and extract as HTML
if cli.stdin {
let mut buf = String::new();
io::stdin()
.read_to_string(&mut buf)
.map_err(|e| format!("failed to read stdin: {e}"))?;
let options = build_extraction_options(cli);
return extract_with_options(&buf, None, &options)
.map(|r| FetchOutput::Local(Box::new(r)))
.map_err(|e| format!("extraction error: {e}"));
}
if let Some(ref path) = cli.file {
let html =
std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
let options = build_extraction_options(cli);
return extract_with_options(&html, None, &options)
.map(|r| FetchOutput::Local(Box::new(r)))
.map_err(|e| format!("extraction error: {e}"));
}
// Only the first positional URL is used here.
let raw_url = cli
.urls
.first()
.ok_or("no input provided -- pass a URL, --file, or --stdin")?;
let url = normalize_url(raw_url);
let url = url.as_str();
// Used as an Option below: the `--cloud` path requires it, the fallback
// path skips the cloud when it is absent.
let cloud_client = webclaw_fetch::cloud::CloudClient::new(cli.api_key.as_deref());
// --cloud: skip local, go straight to cloud API
if cli.cloud {
let c =
cloud_client.ok_or("--cloud requires WEBCLAW_API_KEY (set via env or --api-key)")?;
let options = build_extraction_options(cli);
let resp = c
.scrape(
url,
&[cli.format.as_api_str()],
&options.include_selectors,
&options.exclude_selectors,
options.only_main_content,
)
.await?;
return Ok(FetchOutput::Cloud(resp));
}
// Normal path: try local first
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(cli);
let result = client
.fetch_and_extract_with_options(url, &options)
.await
.map_err(|e| format!("fetch error: {e}"))?;
// Check if we should fall back to cloud
let reason = detect_empty(&result);
if !matches!(reason, EmptyReason::None) {
if let Some(ref c) = cloud_client {
eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API...");
match c
.scrape(
url,
&[cli.format.as_api_str()],
&options.include_selectors,
&options.exclude_selectors,
options.only_main_content,
)
.await
{
Ok(resp) => return Ok(FetchOutput::Cloud(resp)),
Err(e) => {
eprintln!("\x1b[33mwarning:\x1b[0m cloud fallback failed: {e}");
// Fall through to return the local result with a warning
}
}
}
// Reached when there is no cloud client or the fallback failed.
warn_empty(url, &reason);
}
Ok(FetchOutput::Local(Box::new(result)))
}
/// Obtain raw HTML without running extraction. Used for --raw-html and
/// brand extraction.
///
/// Input precedence matches the extracting path: `--stdin`, then `--file`,
/// then the first positional URL. For stdin/file input a synthetic
/// `FetchResult` is returned with an empty URL, status 200 and default
/// headers/elapsed; for a URL the page is fetched with the client built
/// from the CLI flags.
pub async fn fetch_html(cli: &Cli) -> Result<FetchResult, String> {
    // Resolve local input first; `None` means we need to go to the network.
    let local_html = if cli.stdin {
        let mut buf = String::new();
        io::stdin()
            .read_to_string(&mut buf)
            .map_err(|e| format!("failed to read stdin: {e}"))?;
        Some(buf)
    } else if let Some(path) = cli.file.as_ref() {
        let contents =
            std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
        Some(contents)
    } else {
        None
    };

    if let Some(html) = local_html {
        return Ok(FetchResult {
            html,
            url: String::new(),
            status: 200,
            headers: Default::default(),
            elapsed: Default::default(),
        });
    }

    let Some(raw_url) = cli.urls.first() else {
        return Err("no input provided -- pass a URL, --file, or --stdin".to_string());
    };
    let url = normalize_url(raw_url);

    let client = match FetchClient::new(build_fetch_config(cli)) {
        Ok(client) => client,
        Err(e) => return Err(format!("client error: {e}")),
    };
    match client.fetch(&url).await {
        Ok(fetched) => Ok(fetched),
        Err(e) => Err(format!("fetch error: {e}")),
    }
}
/// Fetch external stylesheets referenced in HTML and inject them as `<style>` blocks.
/// This allows brand extraction to see colors/fonts from external CSS files.
///
/// * `html` — the page markup to enrich.
/// * `base_url` — used to resolve relative stylesheet hrefs; if it does not
///   parse as a URL the HTML is returned unchanged.
///
/// At most 10 stylesheet links are followed. Each one must pass
/// `validate_public_http_url`, is fetched with a 5 second timeout and no
/// redirects, and is ignored unless the response is successful, does not look
/// like an HTML document (leading `<!`), and is under 2,000,000 bytes.
///
/// The collected CSS is inserted immediately before the first `</head>`
/// (matched ASCII-case-insensitively), or prepended when there is no
/// `</head>`. If nothing was collected the HTML is returned unchanged.
pub async fn enrich_html_with_stylesheets(html: &str, base_url: &str) -> String {
    let base = match url::Url::parse(base_url) {
        Ok(u) => u,
        Err(_) => return html.to_string(),
    };

    // Extract stylesheet hrefs from <link rel="stylesheet" href="...">,
    // accepting either attribute order.
    let re = regex::Regex::new(
        r#"<link[^>]+rel=["']stylesheet["'][^>]+href=["']([^"']+)["']|<link[^>]+href=["']([^"']+)["'][^>]+rel=["']stylesheet["']"#
    ).expect("stylesheet <link> pattern is a valid static regex");

    let hrefs: Vec<String> = re
        .captures_iter(html)
        .filter_map(|cap| {
            let href = cap.get(1).or_else(|| cap.get(2))?;
            Some(
                base.join(href.as_str())
                    .map(|u| u.to_string())
                    // Keep the raw href when it cannot be resolved; the URL
                    // validation below decides whether it is fetchable.
                    .unwrap_or_else(|_| href.as_str().to_string()),
            )
        })
        .take(10)
        .collect();
    if hrefs.is_empty() {
        return html.to_string();
    }

    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(5))
        .redirect(reqwest::redirect::Policy::none())
        .build()
        .unwrap_or_default();

    let mut extra_css = String::new();
    for href in &hrefs {
        // Skip anything the shared URL guard rejects.
        if webclaw_fetch::url_security::validate_public_http_url(href)
            .await
            .is_err()
        {
            continue;
        }
        if let Ok(resp) = client.get(href).send().await
            && resp.status().is_success()
            && let Ok(body) = resp.text().await
            && !body.trim_start().starts_with("<!")
            && body.len() < 2_000_000
        {
            extra_css.push_str("\n<style>\n");
            extra_css.push_str(&body);
            extra_css.push_str("\n</style>\n");
        }
    }
    if extra_css.is_empty() {
        return html.to_string();
    }

    // ASCII-only lowercasing keeps every byte offset aligned with `html`.
    // Unicode `to_lowercase()` can change the byte length of earlier
    // characters, so an offset found in that copy could point at the wrong
    // place in `html` or fall off a char boundary and panic when slicing.
    if let Some(pos) = html.to_ascii_lowercase().find("</head>") {
        let mut enriched = String::with_capacity(html.len() + extra_css.len());
        enriched.push_str(&html[..pos]);
        enriched.push_str(&extra_css);
        enriched.push_str(&html[pos..]);
        enriched
    } else {
        format!("{extra_css}{html}")
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::cli::OutputFormat;
    use webclaw_core::{Content, Metadata};

    /// Build a minimal `ExtractionResult` whose markdown and plain text are
    /// both `markdown`, with the word count derived from that text.
    fn empty_result(title: Option<&str>, url: Option<&str>, markdown: &str) -> ExtractionResult {
        let words = markdown.split_whitespace().count();
        let body = markdown.to_string();
        ExtractionResult::new(
            Metadata::default()
                .with_title(title.map(str::to_string))
                .with_url(url.map(str::to_string))
                .with_word_count(words),
            Content::default()
                .with_markdown(body.clone())
                .with_plain_text(body),
        )
    }

    #[test]
    fn detect_empty_identifies_consent_redirect_url() {
        let page = empty_result(
            Some("Yahoo"),
            Some("https://guce.advertising.com/collectIdentifiers?sessionId=abc"),
            "Continue",
        );
        let verdict = detect_empty(&page);
        assert_eq!(verdict, EmptyReason::ConsentWall);
    }

    #[test]
    fn detect_empty_identifies_short_consent_title() {
        let page = empty_result(
            Some("Before you continue"),
            Some("https://www.google.com/"),
            "Review privacy options",
        );
        let verdict = detect_empty(&page);
        assert_eq!(verdict, EmptyReason::ConsentWall);
    }

    #[test]
    fn detect_empty_does_not_flag_real_content_with_consent_words() {
        let page = empty_result(
            Some("Cookie consent patterns explained"),
            Some("https://example.com/blog"),
            "This article explains cookie consent patterns for product teams with enough real body text to be useful. It covers consent banners, privacy controls, analytics configuration, regional requirements, product tradeoffs, implementation details, testing flows, debugging notes, accessibility needs, and operational lessons from real teams shipping public websites across multiple markets. It also explains measurement, rollout planning, copy review, support workflows, design constraints, release notes, and how to keep privacy choices understandable for users.",
        );
        let verdict = detect_empty(&page);
        assert_eq!(verdict, EmptyReason::None);
    }

    #[test]
    fn url_to_filename_root() {
        // With and without the trailing slash, the site root maps to an index file.
        let with_slash = url_to_filename("https://example.com/", &OutputFormat::Markdown);
        assert_eq!(with_slash, "example_com/index.md");
        let without_slash = url_to_filename("https://example.com", &OutputFormat::Markdown);
        assert_eq!(without_slash, "example_com/index.md");
    }

    #[test]
    fn url_to_filename_path() {
        let name = url_to_filename("https://example.com/docs/api", &OutputFormat::Markdown);
        assert_eq!(name, "docs/api.md");
    }

    #[test]
    fn url_to_filename_trailing_slash() {
        let name = url_to_filename("https://example.com/docs/api/", &OutputFormat::Markdown);
        assert_eq!(name, "docs/api.md");
    }

    #[test]
    fn url_to_filename_nested_path() {
        let name = url_to_filename("https://example.com/blog/my-post", &OutputFormat::Markdown);
        assert_eq!(name, "blog/my-post.md");
    }

    #[test]
    fn url_to_filename_query_params() {
        let name = url_to_filename("https://example.com/p?id=123", &OutputFormat::Markdown);
        assert_eq!(name, "p_id_123.md");
    }

    #[test]
    fn url_to_filename_json_format() {
        let name = url_to_filename("https://example.com/docs/api", &OutputFormat::Json);
        assert_eq!(name, "docs/api.json");
    }

    #[test]
    fn url_to_filename_text_format() {
        let name = url_to_filename("https://example.com/docs/api", &OutputFormat::Text);
        assert_eq!(name, "docs/api.txt");
    }

    #[test]
    fn url_to_filename_llm_format() {
        let name = url_to_filename("https://example.com/docs/api", &OutputFormat::Llm);
        assert_eq!(name, "docs/api.md");
    }

    #[test]
    fn url_to_filename_html_format() {
        let name = url_to_filename("https://example.com/docs/api", &OutputFormat::Html);
        assert_eq!(name, "docs/api.html");
    }

    #[test]
    fn url_to_filename_special_chars() {
        // Spaces and special chars get replaced with underscores
        let name = url_to_filename(
            "https://example.com/path%20with%20spaces",
            &OutputFormat::Markdown,
        );
        assert_eq!(name, "path_20with_20spaces.md");
    }

    #[test]
    fn write_to_file_creates_dirs() {
        let dir = std::env::temp_dir().join("webclaw_test_output_dir");
        // Start from a clean slate; a missing directory is fine.
        let _ = std::fs::remove_dir_all(&dir);

        write_to_file(&dir, "nested/deep/file.md", "hello").unwrap();
        let written = std::fs::read_to_string(dir.join("nested/deep/file.md")).unwrap();
        assert_eq!(written, "hello");

        let _ = std::fs::remove_dir_all(&dir);
    }

    #[test]
    fn url_to_filename_strips_traversal_segments() {
        // `..` / `.` / empty path segments must not survive into the path.
        let escaped = url_to_filename(
            "https://example.com/../../etc/passwd",
            &OutputFormat::Markdown,
        );
        assert!(!escaped.contains(".."), "traversal leaked: {escaped}");
        assert_eq!(escaped, "etc/passwd.md");

        let collapsed = url_to_filename("https://example.com/a/./b//c", &OutputFormat::Json);
        assert_eq!(collapsed, "a/b/c.json");
    }

    #[test]
    fn safe_relative_filename_rejects_escapes() {
        for rejected in ["../escape.md", "a/../../b.md", "/etc/passwd", ""] {
            assert!(
                safe_relative_filename(rejected).is_err(),
                "{rejected:?} should be rejected"
            );
        }
        // Normal nested relative names stay allowed.
        for accepted in ["nested/deep/file.md", "./ok.md"] {
            assert!(
                safe_relative_filename(accepted).is_ok(),
                "{accepted:?} should be accepted"
            );
        }
    }

    #[test]
    fn write_to_file_refuses_traversal_filename() {
        let dir = std::env::temp_dir().join("webclaw_test_traversal_dir");
        let _ = std::fs::remove_dir_all(&dir);

        // CSV-supplied `url,filename` traversal attempt.
        let err = write_to_file(&dir, "../../tmp/webclaw_pwned.md", "x").unwrap_err();
        assert!(err.contains("refusing"), "unexpected error: {err}");

        let escaped_target = std::path::Path::new("/tmp/webclaw_pwned.md");
        assert!(
            !escaped_target.exists(),
            "traversal write escaped the output dir"
        );

        let _ = std::fs::remove_dir_all(&dir);
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,376 @@
//! Output formatting and rendering for every CLI mode.
//!
//! `render_one` is the single source of truth for turning one
//! `ExtractionResult` into a standalone document for a given format. The
//! `print_*`/`format_*` functions own iteration and separator logic and
//! delegate the per-page body to `render_one`.
use webclaw_core::{ContentDiff, ExtractionResult, Metadata, to_llm_text};
use webclaw_fetch::{BatchExtractResult, CrawlResult, PageResult, SitemapEntry};
use crate::cli::OutputFormat;
/// Return the result's raw HTML when it was captured, otherwise its markdown.
pub fn raw_html_or_markdown(result: &ExtractionResult) -> &str {
    let content = &result.content;
    match content.raw_html.as_deref() {
        Some(html) => html,
        None => content.markdown.as_str(),
    }
}
/// Escape `value` so it can sit inside a YAML double-quoted scalar.
///
/// Backslashes and double quotes are backslash-escaped; newline, carriage
/// return and tab are written as `\n`, `\r` and `\t`. Everything else is
/// copied through unchanged.
fn yaml_escape(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '"' => escaped.push_str("\\\""),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            '\t' => escaped.push_str("\\t"),
            other => escaped.push(other),
        }
    }
    escaped
}

/// Render `meta` as a YAML frontmatter block.
///
/// Emits `title`, `author`, `date` (from `published_date`) and `source`
/// (from `url`) for each field that is set, plus `word_count` when it is
/// greater than zero. The block is delimited by `---` lines and ends with a
/// newline so content can be appended directly after it.
///
/// String values are double-quoted and escaped, so a value containing `"` or
/// `\` still produces well-formed YAML. Values without such characters render
/// exactly as before.
pub fn format_frontmatter(meta: &Metadata) -> String {
    let mut lines = vec!["---".to_string()];
    if let Some(title) = &meta.title {
        lines.push(format!("title: \"{}\"", yaml_escape(title)));
    }
    if let Some(author) = &meta.author {
        lines.push(format!("author: \"{}\"", yaml_escape(author)));
    }
    if let Some(date) = &meta.published_date {
        lines.push(format!("date: \"{}\"", yaml_escape(date)));
    }
    if let Some(url) = &meta.url {
        lines.push(format!("source: \"{}\"", yaml_escape(url)));
    }
    if meta.word_count > 0 {
        lines.push(format!("word_count: {}", meta.word_count));
    }
    lines.push("---".to_string());
    lines.push(String::new()); // blank line after frontmatter
    lines.join("\n")
}
/// Turn one `ExtractionResult` into a standalone document in `format`.
///
/// This is the per-page renderer behind `format_output` and `print_output`;
/// callers are responsible for iteration and separators.
///
/// * Markdown — optional frontmatter (when `show_metadata` is set), the
///   markdown body, then a "Structured Data" JSON section if any structured
///   data is present.
/// * Json — the whole result, pretty-printed.
/// * Text — the plain-text body.
/// * Llm — `to_llm_text`, with the source URL taken from `metadata.url`.
/// * Html — raw HTML, falling back to markdown.
pub fn render_one(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
    match format {
        OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
        OutputFormat::Text => result.content.plain_text.clone(),
        OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
        OutputFormat::Html => raw_html_or_markdown(result).to_string(),
        OutputFormat::Markdown => {
            let mut doc = if show_metadata {
                format_frontmatter(&result.metadata)
            } else {
                String::new()
            };
            doc.push_str(&result.content.markdown);

            if !result.structured_data.is_empty() {
                let structured =
                    serde_json::to_string_pretty(&result.structured_data).unwrap_or_default();
                doc.push_str("\n\n## Structured Data\n\n```json\n");
                doc.push_str(&structured);
                doc.push_str("\n```");
            }
            doc
        }
    }
}
/// Format an `ExtractionResult` into a string for the given output format.
///
/// Thin alias for `render_one`: all three arguments are forwarded unchanged
/// and the rendered document is returned without being printed.
pub fn format_output(
result: &ExtractionResult,
format: &OutputFormat,
show_metadata: bool,
) -> String {
render_one(result, format, show_metadata)
}
/// Render `result` with `render_one` and print the document to stdout,
/// followed by a trailing newline.
pub fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) {
println!("{}", render_one(result, format, show_metadata));
}
/// Print a cloud API response to stdout in the requested format.
///
/// Json always prints the whole response pretty-printed. Every other format
/// prints the first string found in this order:
///
/// 1. the format's own field under `content` (`plain_text` for Text,
///    `llm_text` for Llm, `raw_html` for Html; Markdown has none),
/// 2. `content.markdown`,
/// 3. top-level `markdown`,
///
/// and falls back to the pretty-printed response when none of them exist.
pub fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
    /// Look up `resp.content.<key>` as a string, if present.
    fn content_field<'a>(resp: &'a serde_json::Value, key: &str) -> Option<&'a str> {
        resp.get("content")?.get(key)?.as_str()
    }

    let format_specific = match format {
        OutputFormat::Json | OutputFormat::Markdown => None,
        OutputFormat::Text => content_field(resp, "plain_text"),
        OutputFormat::Llm => content_field(resp, "llm_text"),
        OutputFormat::Html => content_field(resp, "raw_html"),
    };

    let text = if matches!(format, OutputFormat::Json) {
        None
    } else {
        format_specific
            // Cloud response has content.markdown
            .or_else(|| content_field(resp, "markdown"))
            .or_else(|| resp.get("markdown").and_then(|m| m.as_str()))
    };

    match text {
        Some(body) => println!("{body}"),
        None => println!(
            "{}",
            serde_json::to_string_pretty(resp).expect("serialization failed")
        ),
    }
}
/// Print a `ContentDiff` to stdout.
///
/// Json prints the diff pretty-printed. Every other format prints a
/// human-readable summary: status, word-count delta, then the metadata
/// changes, added links, removed links and text diff that are present.
pub fn print_diff_output(diff: &ContentDiff, format: &OutputFormat) {
    if matches!(format, OutputFormat::Json) {
        println!(
            "{}",
            serde_json::to_string_pretty(diff).expect("serialization failed")
        );
        return;
    }

    // All non-JSON formats share one human-readable summary.
    println!("Status: {:?}", diff.status);
    println!("Word count delta: {:+}", diff.word_count_delta);

    if !diff.metadata_changes.is_empty() {
        println!("\nMetadata changes:");
        for change in diff.metadata_changes.iter() {
            let before = change.old.as_deref().unwrap_or("(none)");
            let after = change.new.as_deref().unwrap_or("(none)");
            println!(" {}: {} -> {}", change.field, before, after,);
        }
    }

    if !diff.links_added.is_empty() {
        println!("\nLinks added:");
        diff.links_added
            .iter()
            .for_each(|link| println!(" + {} ({})", link.href, link.text));
    }

    if !diff.links_removed.is_empty() {
        println!("\nLinks removed:");
        diff.links_removed
            .iter()
            .for_each(|link| println!(" - {} ({})", link.href, link.text));
    }

    if let Some(text_diff) = &diff.text_diff {
        println!("\n{text_diff}");
    }
}
/// Print a crawl result to stdout.
///
/// Json prints the whole `CrawlResult` pretty-printed. The other formats
/// print each page that has an extraction (pages without one are skipped)
/// as a `---` separator, a per-format header and body, then a blank line.
/// `show_metadata` only affects Markdown, where it adds frontmatter.
pub fn print_crawl_output(result: &CrawlResult, format: &OutputFormat, show_metadata: bool) {
    if matches!(format, OutputFormat::Json) {
        println!(
            "{}",
            serde_json::to_string_pretty(result).expect("serialization failed")
        );
        return;
    }

    let extracted = result
        .pages
        .iter()
        .filter_map(|page| page.extraction.as_ref().map(|extraction| (page, extraction)));

    for (page, extraction) in extracted {
        println!("---");
        match format {
            OutputFormat::Markdown => {
                println!("# Page: {}\n", page.url);
                if show_metadata {
                    print!("{}", format_frontmatter(&extraction.metadata));
                }
                println!("{}", extraction.content.markdown);
            }
            OutputFormat::Text => {
                println!("# Page: {}\n", page.url);
                println!("{}", extraction.content.plain_text);
            }
            OutputFormat::Llm => {
                println!("{}", to_llm_text(extraction, Some(page.url.as_str())));
            }
            OutputFormat::Html => {
                println!("<!-- Page: {} -->\n", page.url);
                println!("{}", raw_html_or_markdown(extraction));
            }
            OutputFormat::Json => unreachable!("JSON output returns before the page loop"),
        }
        println!();
    }
}
/// Print batch extraction results.
///
/// Json prints one array of `{url, result}` / `{url, error}` objects to
/// stdout. The other formats print each successful extraction to stdout as a
/// `---` separator, a per-format header and body, then a blank line; failures
/// are written to stderr as `error: <url> -- <error>`. `show_metadata` only
/// affects Markdown, where it adds frontmatter.
pub fn print_batch_output(
    results: &[BatchExtractResult],
    format: &OutputFormat,
    show_metadata: bool,
) {
    if matches!(format, OutputFormat::Json) {
        // Build a JSON array of {url, result?, error?} objects
        let mut entries: Vec<serde_json::Value> = Vec::with_capacity(results.len());
        for item in results {
            let entry = match &item.result {
                Ok(extraction) => serde_json::json!({
                    "url": item.url,
                    "result": extraction,
                }),
                Err(e) => serde_json::json!({
                    "url": item.url,
                    "error": e.to_string(),
                }),
            };
            entries.push(entry);
        }
        println!(
            "{}",
            serde_json::to_string_pretty(&entries).expect("serialization failed")
        );
        return;
    }

    for item in results {
        let extraction = match &item.result {
            Ok(extraction) => extraction,
            Err(e) => {
                eprintln!("error: {} -- {}", item.url, e);
                continue;
            }
        };

        println!("---");
        match format {
            OutputFormat::Markdown => {
                println!("# {}\n", item.url);
                if show_metadata {
                    print!("{}", format_frontmatter(&extraction.metadata));
                }
                println!("{}", extraction.content.markdown);
            }
            OutputFormat::Text => {
                println!("# {}\n", item.url);
                println!("{}", extraction.content.plain_text);
            }
            OutputFormat::Llm => {
                println!("{}", to_llm_text(extraction, Some(item.url.as_str())));
            }
            OutputFormat::Html => {
                println!("<!-- {} -->\n", item.url);
                println!("{}", raw_html_or_markdown(extraction));
            }
            OutputFormat::Json => unreachable!("JSON output returns before the result loop"),
        }
        println!();
    }
}
/// Print sitemap entries to stdout: pretty-printed JSON for the Json format,
/// otherwise one URL per line.
pub fn print_map_output(entries: &[SitemapEntry], format: &OutputFormat) {
    if let OutputFormat::Json = format {
        println!(
            "{}",
            serde_json::to_string_pretty(entries).expect("serialization failed")
        );
    } else {
        entries.iter().for_each(|entry| println!("{}", entry.url));
    }
}
/// Format a streaming progress line for a completed page.
pub fn format_progress(page: &PageResult, index: usize, max_pages: usize) -> String {
let status = if page.error.is_some() { "ERR" } else { "OK " };
let timing = format!("{}ms", page.elapsed.as_millis());
let detail = if let Some(ref extraction) = page.extraction {
format!(", {} words", extraction.metadata.word_count)
} else if let Some(ref err) = page.error {
format!(" ({err})")
} else {
String::new()
};
format!(
"[{index}/{max_pages}] {status} {} ({timing}{detail})",
page.url
)
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,121 @@
//! Webhook delivery and `--on-change` command execution.
/// Run the `--on-change` command, feeding it `stdin_payload` on stdin.
///
/// By default the command string is tokenized with `shlex` (POSIX-ish
/// quoting rules) and the program is executed directly, with no shell in
/// between. Shell metacharacters such as `;`, `&&`, `|`, `$()` and env
/// expansion therefore have no effect. This matters because the command can
/// come from an untrusted config file or an MCP-driven agent, where passing
/// the whole string to `sh -c` would allow arbitrary shell execution.
///
/// Users who need a pipeline can wrap it in a script, or opt in to `sh -c`
/// by setting `WEBCLAW_ALLOW_SHELL` to `1` or `true` (a deliberately noisy
/// escape hatch).
///
/// Failures (unparseable command, empty command, spawn error) are logged to
/// stderr; nothing is returned. The payload write is best-effort and the
/// child is not awaited.
pub async fn spawn_on_change(cmd: &str, stdin_payload: &[u8]) {
    use tokio::io::AsyncWriteExt;

    eprintln!("[watch] Running: {cmd}");

    let shell_opt_in = matches!(
        std::env::var("WEBCLAW_ALLOW_SHELL"),
        Ok(flag) if flag == "1" || flag.eq_ignore_ascii_case("true")
    );

    let mut command = if shell_opt_in {
        eprintln!("[watch] WEBCLAW_ALLOW_SHELL=1 — executing via sh -c (unsafe)");
        let mut shell = tokio::process::Command::new("sh");
        shell.arg("-c").arg(cmd);
        shell
    } else {
        let argv = match shlex::split(cmd) {
            Some(tokens) => tokens,
            None => {
                eprintln!("[watch] Failed to parse --on-change command (unbalanced quotes?)");
                return;
            }
        };
        let mut tokens = argv.iter();
        let program = match tokens.next() {
            Some(program) => program,
            None => {
                eprintln!("[watch] --on-change command is empty");
                return;
            }
        };
        let mut direct = tokio::process::Command::new(program);
        direct.args(tokens);
        direct
    };
    command.stdin(std::process::Stdio::piped());

    let mut child = match command.spawn() {
        Ok(child) => child,
        Err(e) => {
            eprintln!("[watch] Failed to run command: {e}");
            return;
        }
    };
    if let Some(mut pipe) = child.stdin.take() {
        // Best-effort: the command may not read its stdin at all.
        let _ = pipe.write_all(stdin_payload).await;
    }
}
/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr.
/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly.
///
/// * `url` — destination. A URL containing `discord.com/api/webhooks` gets a
///   Discord embed body, one containing `hooks.slack.com` gets a Slack `text`
///   body, and anything else receives `payload` serialized as-is.
/// * `payload` — the JSON event. Its `event` string, when present, becomes
///   the Discord/Slack title (default `notification`).
///
/// The request is sent from a task started with `tokio::spawn`, so this
/// returns immediately. The task validates the URL with
/// `validate_public_http_url` before sending and uses a 10 second timeout.
pub fn fire_webhook(url: &str, payload: &serde_json::Value) {
    let url = url.to_string();
    let is_discord = url.contains("discord.com/api/webhooks");
    let is_slack = url.contains("hooks.slack.com");

    let body = if is_discord || is_slack {
        // Both chat integrations show the event name plus the pretty-printed payload.
        let event = payload
            .get("event")
            .and_then(|v| v.as_str())
            .unwrap_or("notification");
        let details = serde_json::to_string_pretty(payload).unwrap_or_default();
        let wrapped = if is_discord {
            serde_json::json!({
                "embeds": [{
                    "title": format!("webclaw: {event}"),
                    "description": format!("```json\n{details}\n```"),
                    "color": 5814783
                }]
            })
        } else {
            serde_json::json!({
                "text": format!("*webclaw: {event}*\n```{details}```")
            })
        };
        wrapped.to_string()
    } else {
        serde_json::to_string(payload).unwrap_or_default()
    };

    tokio::spawn(async move {
        // SSRF guard: a webhook URL is user-supplied and otherwise bypasses
        // the fetch-layer protections, so resolve + reject private/internal
        // destinations before sending the payload.
        if let Err(e) = webclaw_fetch::url_security::validate_public_http_url(&url).await {
            eprintln!("[webhook] refusing unsafe URL: {e}");
            return;
        }
        match reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(10))
            .build()
        {
            Ok(c) => match c
                .post(&url)
                .header("Content-Type", "application/json")
                .body(body)
                .send()
                .await
            {
                Ok(resp) => {
                    // Log at most 60 bytes of the URL, backing up to a char
                    // boundary: slicing a `str` inside a multi-byte character
                    // panics, and the URL is user-supplied text.
                    let mut end = url.len().min(60);
                    while !url.is_char_boundary(end) {
                        end -= 1;
                    }
                    eprintln!("[webhook] POST {} -> {}", &url[..end], resp.status());
                }
                Err(e) => eprintln!("[webhook] POST failed: {e}"),
            },
            Err(e) => eprintln!("[webhook] client error: {e}"),
        }
    });
}

View file

@ -3,12 +3,16 @@ name = "webclaw-core"
description = "Pure HTML content extraction engine for LLMs"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
# Reddit regression fixtures are real old.reddit.com pages read at test time;
# they're large and only needed to run the test suite from the repo, so keep
# them out of the published crate.
exclude = ["testdata/reddit/*.html"]
[lints]
workspace = true
[features]
default = ["quickjs"]
quickjs = ["rquickjs"]

View file

@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum DomainType {
Article,
Documentation,

View file

@ -3,6 +3,7 @@
use thiserror::Error;
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum ExtractError {
#[error("failed to parse HTML")]
ParseError,

View file

@ -16,6 +16,29 @@ static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);
/// Substrings used to decide whether the QuickJS scan is worth running.
///
/// The scan only ever surfaces `globalThis.__*` object/array properties, and
/// the seeded `__next_f` only emits when non-empty. An inline script therefore
/// has to write to the global object to produce a blob, and these markers
/// cover the usual ways of doing that: assignments through `window.`,
/// `globalThis.` or `self.__…`, and the `__NEXT_DATA__` / `__NUXT__` /
/// `application/json` payload conventions.
///
/// This is a heuristic gate, not a proof. It is conservative in one
/// direction: any marker may also appear in non-script HTML, which only makes
/// us skip *less* often, never more.
///
/// NOTE(review): a script that creates a `__*` global without any of these
/// substrings (for example a bare top-level `var __X = …`) would still be
/// skipped — confirm whether such pages matter before relying on the gate
/// being output-neutral.
const JS_CANDIDATE_MARKERS: [&str; 6] = [
    "window.",
    "globalThis.",
    // Superset of the former `self.__next` marker.
    "self.__",
    "__NEXT_DATA__",
    "__NUXT__",
    "application/json",
];

/// Returns true if the HTML plausibly contains JS-assigned data the QuickJS
/// scan could surface, i.e. it contains at least one candidate marker.
/// Callers skip the VM when this returns false.
pub fn has_js_candidate_data(html: &str) -> bool {
    JS_CANDIDATE_MARKERS
        .iter()
        .any(|marker| html.contains(*marker))
}
/// A blob of data extracted from JS execution.
pub struct JsDataBlob {
pub name: String,
@ -24,9 +47,17 @@ pub struct JsDataBlob {
}
/// Run inline `<script>` tags in a QuickJS sandbox and collect `window.__*` data.
///
/// Parses `html` and hands the document to [`extract_js_data_from_doc`].
/// Callers that already hold a parsed [`Html`] should call that function
/// directly to avoid parsing twice.
pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
    extract_js_data_from_doc(&Html::parse_document(html))
}
/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data,
/// reusing an already-parsed [`Html`] document instead of re-parsing the HTML.
pub fn extract_js_data_from_doc(doc: &Html) -> Vec<JsDataBlob> {
let scripts: Vec<String> = doc
.select(&SCRIPT_SELECTOR)
.filter(|el| {

View file

@ -1,10 +1,12 @@
//! webclaw-core: Pure HTML content extraction engine for LLMs.
//!
//! Takes raw HTML + optional URL, returns structured content
//! (metadata, markdown, plain text, links, images, code blocks).
//! Zero network dependencies — WASM-compatible by design.
#![forbid(unsafe_code)]
pub mod brand;
pub(crate) mod data_island;
/// webclaw-core: Pure HTML content extraction engine for LLMs.
///
/// Takes raw HTML + optional URL, returns structured content
/// (metadata, markdown, plain text, links, images, code blocks).
/// Zero network dependencies — WASM-compatible by design.
pub mod diff;
pub mod domain;
pub mod endpoints;
@ -38,6 +40,14 @@ use url::Url;
///
/// `html` — raw HTML string to parse
/// `url` — optional source URL, used for resolving relative links and domain detection
///
/// # Example
///
/// ```rust
/// let html = "<html><body><article><h1>Hello</h1><p>World</p></article></body></html>";
/// let result = webclaw_core::extract(html, Some("https://example.com")).unwrap();
/// assert!(result.content.markdown.contains("# Hello"));
/// ```
pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, ExtractError> {
// Same entry point as `extract_with_options`, called with `ExtractionOptions::default()`.
extract_with_options(html, url, &ExtractionOptions::default())
}
@ -221,9 +231,14 @@ fn extract_with_options_inner(
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
// static JSON data island extraction above with runtime-evaluated data.
//
// Output-neutral fast path: the QuickJS scan can only ever surface
// `globalThis.__*` data, so when the HTML contains none of the candidate
// markers the VM is provably a no-op and is skipped entirely. We also reuse
// the already-parsed `doc` instead of re-parsing the HTML a second time.
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
{
let blobs = js_eval::extract_js_data(html);
if js_eval::has_js_candidate_data(html) {
let blobs = js_eval::extract_js_data_from_doc(&doc);
if !blobs.is_empty() {
let js_text = js_eval::extract_readable_text(&blobs);
if !js_text.is_empty() {

View file

@ -184,7 +184,7 @@ fn detect_long_line_cycle(words: &[&str]) -> Option<String> {
// Try exact N-copy cycles first
for n_copies in (2..=5).rev() {
if !slice.len().is_multiple_of(n_copies) {
if slice.len() % n_copies != 0 {
continue;
}
let cycle_len = slice.len() / n_copies;
@ -759,7 +759,7 @@ pub(crate) fn dedup_comma_lists(input: &str) -> String {
// First: try full cycle dedup (a,b,c,a,b,c -> a,b,c)
if items.len() >= 6 {
for cycle_len in 1..=items.len() / 2 {
if !items.len().is_multiple_of(cycle_len) {
if items.len() % cycle_len != 0 {
continue;
}
let pattern = &items[..cycle_len];

View file

@ -13,6 +13,8 @@ use crate::noise;
use crate::types::{CodeBlock, Image, Link};
static CODE_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("code").unwrap());
static IMG_ALT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("img[alt]").unwrap());
static A_HREF_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("a[href]").unwrap());
/// Maximum recursion depth for DOM traversal.
/// Express.co.uk live blogs and similar pages can nest 1000+ levels deep,
@ -853,7 +855,7 @@ fn collect_assets_from_noise(
assets: &mut ConvertedAssets,
) {
// Collect images with alt text
for img in element.select(&Selector::parse("img[alt]").unwrap()) {
for img in element.select(&IMG_ALT_SELECTOR) {
let alt = img.value().attr("alt").unwrap_or("").to_string();
let src = img
.value()
@ -866,7 +868,7 @@ fn collect_assets_from_noise(
}
// Collect links
for link in element.select(&Selector::parse("a[href]").unwrap()) {
for link in element.select(&A_HREF_SELECTOR) {
let href = link
.value()
.attr("href")

View file

@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize};
use crate::domain::DomainType;
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ExtractionResult {
pub metadata: Metadata,
pub content: Content,
@ -15,7 +16,38 @@ pub struct ExtractionResult {
pub structured_data: Vec<serde_json::Value>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
impl ExtractionResult {
    /// Build a result from `metadata` and `content`, with no domain data and
    /// an empty structured-data list.
    ///
    /// `ExtractionResult` is `#[non_exhaustive]`, so downstream crates cannot
    /// use a struct literal and must go through this constructor.
    pub fn new(metadata: Metadata, content: Content) -> Self {
        let domain_data = None;
        let structured_data = Vec::new();
        Self {
            metadata,
            content,
            domain_data,
            structured_data,
        }
    }

    /// Replace the domain-specific data, returning the updated result.
    #[must_use]
    pub fn with_domain_data(self, domain_data: Option<DomainData>) -> Self {
        Self {
            domain_data,
            ..self
        }
    }

    /// Replace the JSON-LD structured data blocks, returning the updated result.
    #[must_use]
    pub fn with_structured_data(self, structured_data: Vec<serde_json::Value>) -> Self {
        Self {
            structured_data,
            ..self
        }
    }
}
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Metadata {
pub title: Option<String>,
pub description: Option<String>,
@ -29,7 +61,73 @@ pub struct Metadata {
pub word_count: usize,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
// `Metadata` is `#[non_exhaustive]`, so downstream crates build it via
// `Metadata::default()` plus the `with_*` setters below rather than a struct
// literal. Each setter consumes `self`, overwrites one field and returns the
// updated value so calls can be chained.
impl Metadata {
    /// Set the `title` field.
    #[must_use]
    pub fn with_title(mut self, title: Option<String>) -> Self {
        self.title = title;
        self
    }

    /// Set the `description` field.
    #[must_use]
    pub fn with_description(mut self, description: Option<String>) -> Self {
        self.description = description;
        self
    }

    /// Set the `author` field.
    #[must_use]
    pub fn with_author(mut self, author: Option<String>) -> Self {
        self.author = author;
        self
    }

    /// Set the `published_date` field.
    #[must_use]
    pub fn with_published_date(mut self, published_date: Option<String>) -> Self {
        self.published_date = published_date;
        self
    }

    /// Set the `language` field.
    #[must_use]
    pub fn with_language(mut self, language: Option<String>) -> Self {
        self.language = language;
        self
    }

    /// Set the `url` field.
    #[must_use]
    pub fn with_url(mut self, url: Option<String>) -> Self {
        self.url = url;
        self
    }

    /// Set the `site_name` field.
    #[must_use]
    pub fn with_site_name(mut self, site_name: Option<String>) -> Self {
        self.site_name = site_name;
        self
    }

    /// Set the `image` field.
    #[must_use]
    pub fn with_image(mut self, image: Option<String>) -> Self {
        self.image = image;
        self
    }

    /// Set the `favicon` field.
    #[must_use]
    pub fn with_favicon(mut self, favicon: Option<String>) -> Self {
        self.favicon = favicon;
        self
    }

    /// Set the `word_count` field.
    #[must_use]
    pub fn with_word_count(mut self, word_count: usize) -> Self {
        self.word_count = word_count;
        self
    }
}
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Content {
pub markdown: String,
pub plain_text: String,
@ -40,6 +138,47 @@ pub struct Content {
pub raw_html: Option<String>,
}
// `Content` is `#[non_exhaustive]`, so downstream crates build it via
// `Content::default()` plus the `with_*` setters below rather than a struct
// literal. Each setter consumes `self`, overwrites one field and returns the
// updated value so calls can be chained.
impl Content {
    /// Set the `markdown` field.
    #[must_use]
    pub fn with_markdown(mut self, markdown: String) -> Self {
        self.markdown = markdown;
        self
    }

    /// Set the `plain_text` field.
    #[must_use]
    pub fn with_plain_text(mut self, plain_text: String) -> Self {
        self.plain_text = plain_text;
        self
    }

    /// Set the `links` field.
    #[must_use]
    pub fn with_links(mut self, links: Vec<Link>) -> Self {
        self.links = links;
        self
    }

    /// Set the `images` field.
    #[must_use]
    pub fn with_images(mut self, images: Vec<Image>) -> Self {
        self.images = images;
        self
    }

    /// Set the `code_blocks` field.
    #[must_use]
    pub fn with_code_blocks(mut self, code_blocks: Vec<CodeBlock>) -> Self {
        self.code_blocks = code_blocks;
        self
    }

    /// Set the `raw_html` field.
    #[must_use]
    pub fn with_raw_html(mut self, raw_html: Option<String>) -> Self {
        self.raw_html = raw_html;
        self
    }
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Link {
pub text: String,

View file

@ -3,8 +3,12 @@ name = "webclaw-fetch"
description = "HTTP client with browser TLS fingerprint impersonation via wreq"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
[lints]
workspace = true
[dependencies]
webclaw-core = { workspace = true }
webclaw-pdf = { path = "../webclaw-pdf" }

View file

@ -3,13 +3,13 @@
/// Which browser identity to present at the TLS/HTTP layer.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub enum BrowserProfile {
#[default]
Chrome,
Firefox,
/// Safari iOS 26 (iPhone). The one profile proven to defeat
/// DataDome's immobiliare.it / idealista.it / target.com-class
/// rules when paired with a country-scoped residential proxy.
/// iOS Safari fingerprint. Useful for sites with stricter TLS
/// requirements that expect a mobile Safari client.
SafariIos,
/// Randomly pick from all available profiles on each request.
Random,

View file

@ -168,6 +168,13 @@ impl Response {
fn into_text(self) -> String {
String::from_utf8_lossy(&self.body).into_owned()
}
/// Consume the response and hand back the owned body buffer. Used by
/// the PDF path to move the bytes into `spawn_blocking` without copying
/// (`Bytes` is a refcounted buffer, so this is a cheap move).
fn into_body(self) -> bytes::Bytes {
self.body
}
}
/// Internal representation of the client pool strategy.
@ -330,6 +337,18 @@ impl FetchClient {
/// rescue logic; use [`Self::fetch_smart`] for that.
#[instrument(skip(self), fields(url = %url))]
pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
self.with_retry(url, || self.fetch_once(url)).await
}
/// Shared retry loop for the public `fetch` / `fetch_with_headers`
/// entry points. Runs `attempt_fn` with exponential backoff (0s, 1s —
/// 2 attempts total), retrying on transient network errors and
/// retryable HTTP statuses (5xx, 429). `url` is for logging only.
async fn with_retry<F, Fut>(&self, url: &str, attempt_fn: F) -> Result<FetchResult, FetchError>
where
F: Fn() -> Fut,
Fut: std::future::Future<Output = Result<FetchResult, FetchError>>,
{
let delays = [Duration::ZERO, Duration::from_secs(1)];
let mut last_err = None;
@ -338,7 +357,7 @@ impl FetchClient {
tokio::time::sleep(*delay).await;
}
match self.fetch_once(url).await {
match attempt_fn().await {
Ok(result) => {
if is_retryable_status(result.status) && attempt < delays.len() - 1 {
warn!(
@ -414,46 +433,8 @@ impl FetchClient {
url: &str,
extra: &[(&str, &str)],
) -> Result<FetchResult, FetchError> {
let delays = [Duration::ZERO, Duration::from_secs(1)];
let mut last_err = None;
for (attempt, delay) in delays.iter().enumerate() {
if attempt > 0 {
tokio::time::sleep(*delay).await;
}
match self.fetch_once_with_headers(url, extra).await {
Ok(result) => {
if is_retryable_status(result.status) && attempt < delays.len() - 1 {
warn!(
url,
status = result.status,
attempt = attempt + 1,
"retryable status, will retry"
);
last_err = Some(FetchError::Build(format!("HTTP {}", result.status)));
continue;
}
if attempt > 0 {
debug!(url, attempt = attempt + 1, "retry succeeded");
}
return Ok(result);
}
Err(e) => {
if !is_retryable_error(&e) || attempt == delays.len() - 1 {
return Err(e);
}
warn!(
url,
error = %e,
attempt = attempt + 1,
"transient error, will retry"
);
last_err = Some(e);
}
}
}
Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
self.with_retry(url, || self.fetch_once_with_headers(url, extra))
.await
}
/// Fetch a URL then extract structured content.
@ -514,17 +495,24 @@ impl FetchClient {
if is_pdf {
debug!(status, "detected PDF response, using pdf extraction");
let bytes = response.body();
let bytes = response.into_body();
let byte_len = bytes.len();
let elapsed = start.elapsed();
debug!(
status,
bytes = bytes.len(),
bytes = byte_len,
elapsed_ms = %elapsed.as_millis(),
"PDF fetch complete"
);
let pdf_result = webclaw_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
// pdf-extract is synchronous and CPU-bound; run it off the async
// executor so a large PDF doesn't stall the reactor thread.
let pdf_mode = self.pdf_mode.clone();
let pdf_result =
tokio::task::spawn_blocking(move || webclaw_pdf::extract_pdf(&bytes, pdf_mode))
.await
.map_err(|e| FetchError::Build(format!("pdf extraction task failed: {e}")))??;
Ok(pdf_to_extraction_result(&pdf_result, &final_url))
} else if let Some(doc_type) =
crate::document::is_document_content_type(&headers, &final_url)
@ -814,30 +802,16 @@ fn pdf_to_extraction_result(
let markdown = webclaw_pdf::to_markdown(pdf);
let word_count = markdown.split_whitespace().count();
webclaw_core::ExtractionResult {
metadata: webclaw_core::Metadata {
title: pdf.metadata.title.clone(),
description: pdf.metadata.subject.clone(),
author: pdf.metadata.author.clone(),
published_date: None,
language: None,
url: Some(url.to_string()),
site_name: None,
image: None,
favicon: None,
word_count,
},
content: webclaw_core::Content {
markdown,
plain_text: pdf.text.clone(),
links: Vec::new(),
images: Vec::new(),
code_blocks: Vec::new(),
raw_html: None,
},
domain_data: None,
structured_data: vec![],
}
let metadata = webclaw_core::Metadata::default()
.with_title(pdf.metadata.title.clone())
.with_description(pdf.metadata.subject.clone())
.with_author(pdf.metadata.author.clone())
.with_url(Some(url.to_string()))
.with_word_count(word_count);
let content = webclaw_core::Content::default()
.with_markdown(markdown)
.with_plain_text(pdf.text.clone());
webclaw_core::ExtractionResult::new(metadata, content)
}
/// Collect spawned tasks and reorder results to match input order.

View file

@ -93,6 +93,7 @@ const KEYS_URL: &str = "https://webclaw.io/dashboard/api-keys";
/// Display messages end with an actionable URL so API consumers can
/// surface them to users verbatim.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum CloudError {
/// No `WEBCLAW_API_KEY` configured. Returned by [`smart_fetch_html`]
/// and friends when they hit bot protection but have no client to

View file

@ -98,30 +98,11 @@ pub fn extract_document(
let plain_text = strip_markdown_formatting(&markdown);
let word_count = plain_text.split_whitespace().count();
Ok(webclaw_core::ExtractionResult {
metadata: webclaw_core::Metadata {
title: None,
description: None,
author: None,
published_date: None,
language: None,
url: None,
site_name: None,
image: None,
favicon: None,
word_count,
},
content: webclaw_core::Content {
markdown,
plain_text,
links: Vec::new(),
images: Vec::new(),
code_blocks: Vec::new(),
raw_html: None,
},
domain_data: None,
structured_data: vec![],
})
let metadata = webclaw_core::Metadata::default().with_word_count(word_count);
let content = webclaw_core::Content::default()
.with_markdown(markdown)
.with_plain_text(plain_text);
Ok(webclaw_core::ExtractionResult::new(metadata, content))
}
/// Extract text from a DOCX file (ZIP of XML).

View file

@ -3,6 +3,7 @@
use thiserror::Error;
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum FetchError {
#[error("request failed: {0}")]
Request(#[from] wreq::Error),

View file

@ -33,6 +33,7 @@ use serde_json::{Value, json};
use url::Url;
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -115,23 +116,25 @@ pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchErro
/// without carrying webclaw_fetch types.
pub fn parse(html: &str, url: &str, asin: &str) -> Value {
let jsonld = find_product_jsonld(html);
// Single scan for the og:* fallbacks read below.
let og_meta = parse_og(html);
// Three-tier title: JSON-LD `name` > Amazon's `#productTitle` span
// (only present on real static HTML) > cloud-synthesized og:title.
let title = jsonld
.as_ref()
.and_then(|v| get_text(v, "name"))
.or_else(|| dom_title(html))
.or_else(|| og(html, "title"));
.or_else(|| og_meta.unescaped("title"));
let image = jsonld
.as_ref()
.and_then(get_first_image)
.or_else(|| dom_image(html))
.or_else(|| og(html, "image"));
.or_else(|| og_meta.unescaped("image"));
let brand = jsonld.as_ref().and_then(get_brand);
let description = jsonld
.as_ref()
.and_then(|v| get_text(v, "description"))
.or_else(|| og(html, "description"));
.or_else(|| og_meta.unescaped("description"));
let aggregate_rating = jsonld.as_ref().and_then(get_aggregate_rating);
let offer = jsonld.as_ref().and_then(first_offer);
@ -336,31 +339,6 @@ fn dom_image(html: &str) -> Option<String> {
.map(|m| m.as_str().to_string())
}
/// OG meta tag lookup. Cloud-synthesized HTML ships these even when
/// JSON-LD and Amazon-DOM-IDs are both absent, so they're the last
/// line of defence for `title`, `image`, `description`.
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| html_unescape(m.as_str()));
}
}
None
}
/// Undo the synthesize_html attribute escaping for the few entities it
/// emits. Keeps us off a heavier HTML-entity dep.
fn html_unescape(s: &str) -> String {
s.replace("&quot;", "\"")
.replace("&amp;", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
}
/// Convert a [`CloudError`] into a [`FetchError`].
///
/// The cloud error is flattened to its `Display` text and wrapped in
/// `FetchError::Build`, so the original error value is not preserved.
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
FetchError::Build(e.to_string())
}
@ -477,7 +455,7 @@ mod tests {
fn og_unescape_handles_quot_entity() {
let html = r#"<meta property="og:title" content="Apple &quot;M2 Pro&quot; Laptop">"#;
assert_eq!(
og(html, "title").as_deref(),
parse_og(html).unescaped("title").as_deref(),
Some(r#"Apple "M2 Pro" Laptop"#)
);
}

View file

@ -15,6 +15,7 @@ use serde_json::{Value, json};
use url::Url;
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -65,19 +66,21 @@ pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchErro
pub fn parse(html: &str, url: &str, item_id: &str) -> Value {
let jsonld = find_product_jsonld(html);
// Single scan for the three og:* fields read as fallbacks below.
let og_meta = parse_og(html);
let title = jsonld
.as_ref()
.and_then(|v| get_text(v, "name"))
.or_else(|| og(html, "title"));
.or_else(|| og_meta.raw("title"));
let image = jsonld
.as_ref()
.and_then(get_first_image)
.or_else(|| og(html, "image"));
.or_else(|| og_meta.raw("image"));
let brand = jsonld.as_ref().and_then(get_brand);
let description = jsonld
.as_ref()
.and_then(|v| get_text(v, "description"))
.or_else(|| og(html, "description"));
.or_else(|| og_meta.raw("description"));
let offer = jsonld.as_ref().and_then(first_offer);
// eBay's AggregateOffer uses lowPrice/highPrice. Offer uses price.
@ -268,19 +271,6 @@ fn get_aggregate_rating(v: &Value) -> Option<Value> {
}))
}
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| m.as_str().to_string());
}
}
None
}
/// Convert a [`CloudError`] into a [`FetchError`].
///
/// The cloud error is flattened to its `Display` text and wrapped in
/// `FetchError::Build`, so the original error value is not preserved.
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
FetchError::Build(e.to_string())
}

View file

@ -42,6 +42,7 @@ use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::{og, parse_og};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -142,15 +143,17 @@ fn build_jsonld_payload(product: &Value, html: &str, url: &str) -> Value {
/// Build a minimal payload from OG / product meta tags. Used when a
/// page has no Product JSON-LD at all.
fn build_og_payload(html: &str, url: &str) -> Value {
// Single scan for the three og:* fields this fallback reads.
let og_meta = parse_og(html);
let offers = build_og_offer(html).map(|o| vec![o]).unwrap_or_default();
let image = og(html, "image");
let image = og_meta.raw("image");
let images: Vec<Value> = image.map(|i| vec![Value::String(i)]).unwrap_or_default();
json!({
"url": url,
"data_source": "og_fallback",
"name": og(html, "title"),
"description": og(html, "description"),
"name": og_meta.raw("title"),
"description": og_meta.raw("description"),
"brand": meta_property(html, "product:brand"),
"sku": None::<String>,
"mpn": None::<String>,
@ -368,20 +371,6 @@ fn build_og_offer(html: &str) -> Option<Value> {
}))
}
/// Pull the value of `<meta property="og:{prop}" content="...">`.
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| m.as_str().to_string());
}
}
None
}
/// Pull the value of any `<meta property="..." content="...">` tag.
/// Needed for namespaced OG variants like `product:price:amount` that
/// the simple `og:*` matcher above doesn't cover.

View file

@ -26,6 +26,7 @@ use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -74,19 +75,26 @@ pub fn parse(html: &str, url: &str, listing_id: &str) -> Value {
let jsonld = find_product_jsonld(html);
let slug_title = humanise_slug(parse_slug(url).as_deref());
// Single scan for the three og:* fields used as fallbacks below.
let og_meta = parse_og(html);
let title = jsonld
.as_ref()
.and_then(|v| get_text(v, "name"))
.or_else(|| og(html, "title").filter(|t| !is_generic_title(t)))
.or_else(|| og_meta.raw("title").filter(|t| !is_generic_title(t)))
.or(slug_title);
let description = jsonld
.as_ref()
.and_then(|v| get_text(v, "description"))
.or_else(|| og(html, "description").filter(|d| !is_generic_description(d)));
.or_else(|| {
og_meta
.raw("description")
.filter(|d| !is_generic_description(d))
});
let image = jsonld
.as_ref()
.and_then(get_first_image)
.or_else(|| og(html, "image"));
.or_else(|| og_meta.raw("image"));
let brand = jsonld.as_ref().and_then(get_brand);
// Etsy listings often ship either a single Offer or an
@ -359,19 +367,6 @@ fn strip_schema_prefix(s: String) -> String {
.replace("https://schema.org/", "")
}
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| m.as_str().to_string());
}
}
None
}
/// Etsy links the owning shop with a canonical anchor like
/// `<a href="/shop/ShopName" ...>`. Grab the first one after the
/// breadcrumb boundary.

View file

@ -33,6 +33,7 @@ pub mod instagram_post;
pub mod instagram_profile;
pub mod linkedin_post;
pub mod npm;
pub(crate) mod og;
pub mod pypi;
pub mod reddit;
pub mod shopify_collection;

View file

@ -0,0 +1,79 @@
//! Shared Open Graph (`og:*`) meta-tag parsing for the HTML vertical
//! extractors.
//!
//! Several site extractors read a handful of `og:*` properties (title,
//! description, image, ...) from the page `<head>`. Each used to carry a
//! verbatim copy of the same regex + scan helper. This module centralises
//! that logic and adds [`parse_og`], which collects every `og:*` pair in a
//! single `captures_iter` pass so an extractor that needs multiple fields
//! scans the document once instead of once per field.
//!
//! Values are stored raw. Callers that need HTML entity decoding opt in per
//! call site — via [`OgMeta::unescaped`], or by applying [`html_unescape`]
//! to a raw value — because some extractors intentionally keep the raw
//! value and decoding everywhere would change their output.
use std::collections::HashMap;
use std::sync::OnceLock;
use regex::Regex;
/// Lazily compiled, process-wide matcher for
/// `<meta ... property="og:<name>" ... content="<value>">`.
///
/// Capture group 1 is the property suffix (the part after `og:`), capture
/// group 2 is the `content` value. The pattern is case-insensitive and, as
/// written, only matches tags where:
/// - `property` appears before `content` inside the same tag,
/// - both attributes use double quotes,
/// - the suffix consists of letters and underscores only, and
/// - the content is non-empty.
fn og_regex() -> &'static Regex {
    const OG_META_PATTERN: &str =
        r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#;
    static OG_META: OnceLock<Regex> = OnceLock::new();
    // The pattern is a fixed literal, so this `unwrap` can only fire if the
    // literal itself is edited into something invalid.
    OG_META.get_or_init(|| Regex::new(OG_META_PATTERN).unwrap())
}
/// Look up a single `og:<prop>` meta tag and return its raw `content`.
///
/// Scans `html` with [`og_regex`] and stops at the first tag whose property
/// suffix equals `prop`. The comparison is exact, so although the tag pattern
/// itself is case-insensitive, `prop` must match the suffix's casing as it
/// appears in the document. The value is returned without entity decoding.
///
/// Each call rescans the document; extractors that read several properties
/// should use [`parse_og`] so the document is scanned only once.
pub(crate) fn og(html: &str, prop: &str) -> Option<String> {
    og_regex()
        .captures_iter(html)
        .find(|caps| caps.get(1).is_some_and(|name| name.as_str() == prop))
        .and_then(|caps| caps.get(2).map(|value| value.as_str().to_string()))
}
/// Collect every `og:*` meta tag in `html` into an [`OgMeta`] lookup table,
/// scanning the document exactly once.
///
/// Keys are the property suffixes (the text after `og:`) exactly as captured;
/// values are the raw, not entity-decoded, `content` strings. When a property
/// appears more than once the first occurrence is kept, which matches what
/// [`og`] returns when called for that property.
pub(crate) fn parse_og(html: &str) -> OgMeta {
    let mut props = HashMap::new();
    for caps in og_regex().captures_iter(html) {
        let (Some(name), Some(content)) = (caps.get(1), caps.get(2)) else {
            continue;
        };
        // First occurrence wins: later duplicates are ignored.
        if !props.contains_key(name.as_str()) {
            props.insert(name.as_str().to_string(), content.as_str().to_string());
        }
    }
    OgMeta(props)
}
/// The `og:*` properties found by one [`parse_og`] scan, keyed by property
/// suffix (the text after `og:`) with raw `content` values.
pub(crate) struct OgMeta(HashMap<String, String>);

impl OgMeta {
    /// Return an owned copy of the `og:<prop>` content exactly as it appeared
    /// in the HTML, or `None` if the property was not present.
    pub(crate) fn raw(&self, prop: &str) -> Option<String> {
        let value = self.0.get(prop)?;
        Some(value.clone())
    }

    /// Return the `og:<prop>` content after passing it through
    /// [`html_unescape`], or `None` if the property was not present.
    pub(crate) fn unescaped(&self, prop: &str) -> Option<String> {
        let value = self.0.get(prop)?;
        Some(html_unescape(value))
    }
}
/// Decode the small set of HTML entities that show up in `og:*` content:
/// `&quot;`, `&lt;`, `&gt;` and `&amp;`.
///
/// Each entity is decoded exactly once. `&amp;` is deliberately handled
/// last: decoding it earlier would turn an escaped entity such as
/// `&amp;lt;` into `&lt;` and then let the `&lt;` pass decode it a second
/// time into `<`, corrupting text that legitimately contains a literal
/// `&lt;`. Any other entity (named or numeric) is left untouched.
pub(crate) fn html_unescape(s: &str) -> String {
    s.replace("&quot;", "\"")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        // Must stay last so its output is never re-scanned as an entity.
        .replace("&amp;", "&")
}

View file

@ -28,6 +28,7 @@ use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -181,24 +182,27 @@ async fn html_fallback(
pub fn parse_html(html: &str, url: &str, api_url: &str, slug: &str) -> Value {
let article = find_article_jsonld(html);
// Single scan for the four og:* fields read as fallbacks below.
let og_meta = parse_og(html);
let title = article
.as_ref()
.and_then(|v| get_text(v, "headline"))
.or_else(|| og(html, "title"));
.or_else(|| og_meta.raw("title"));
let description = article
.as_ref()
.and_then(|v| get_text(v, "description"))
.or_else(|| og(html, "description"));
.or_else(|| og_meta.raw("description"));
let cover_image = article
.as_ref()
.and_then(get_first_image)
.or_else(|| og(html, "image"));
.or_else(|| og_meta.raw("image"));
let post_date = article
.as_ref()
.and_then(|v| get_text(v, "datePublished"))
.or_else(|| meta_property(html, "article:published_time"));
let updated_at = article.as_ref().and_then(|v| get_text(v, "dateModified"));
let publication_name = og(html, "site_name");
let publication_name = og_meta.raw("site_name");
let authors = article.as_ref().map(extract_authors).unwrap_or_default();
json!({
@ -302,19 +306,6 @@ fn handle_from_author_url(u: &str) -> Option<String> {
// HTML tag helpers
// ---------------------------------------------------------------------------
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| m.as_str().to_string());
}
}
None
}
/// Pull `<meta property="article:published_time" content="...">` and
/// similar structured meta tags.
fn meta_property(html: &str, prop: &str) -> Option<String> {

View file

@ -32,6 +32,7 @@ use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::parse_og;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -87,11 +88,17 @@ pub fn parse(html: &str, url: &str) -> Result<Value, FetchError> {
// The aiSummary block: not typed (no `@type`), detect by key.
let ai_block = find_ai_summary_block(&blocks);
// Single scan of the page's og:* meta tags; title + description feed
// the regex fallbacks below.
let og_meta = parse_og(html);
let og_title = og_meta.unescaped("title");
let og_description = og_meta.unescaped("description");
// Business name: Dataset > metadata.title regex > URL domain.
let business_name = dataset
.as_ref()
.and_then(|d| get_string(d, "name"))
.or_else(|| parse_name_from_og_title(html))
.or_else(|| parse_name_from_og_title(og_title.as_deref()))
.or_else(|| Some(domain.clone()));
// Rating distribution from the csvw:Table columns. Each column has
@ -105,8 +112,8 @@ pub fn parse(html: &str, url: &str) -> Result<Value, FetchError> {
// Page-title / page-description fallbacks. OG title format:
// "Anthropic is rated \"Bad\" with 1.5 / 5 on Trustpilot"
let (rating_label, rating_from_og) = parse_rating_from_og_title(html);
let total_from_desc = parse_review_count_from_og_description(html);
let (rating_label, rating_from_og) = parse_rating_from_og_title(og_title.as_deref());
let total_from_desc = parse_review_count_from_og_description(og_description.as_deref());
// Recent reviews carried by the aiSummary block.
let recent_reviews: Vec<Value> = ai_block
@ -336,20 +343,21 @@ fn compute_rating_stats(distribution: &Value) -> (Option<String>, Option<i64>) {
/// Regex out the business name from the standard Trustpilot OG title
/// shape: `"{name} is rated \"{label}\" with {rating} / 5 on Trustpilot"`.
fn parse_name_from_og_title(html: &str) -> Option<String> {
let title = og(html, "title")?;
/// `title` is the (entity-decoded) `og:title` content.
fn parse_name_from_og_title(title: Option<&str>) -> Option<String> {
let title = title?;
// "Anthropic is rated \"Bad\" with 1.5 / 5 on Trustpilot"
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| Regex::new(r"^(.+?)\s+is rated\b").unwrap());
re.captures(&title)
re.captures(title)
.and_then(|c| c.get(1))
.map(|m| m.as_str().to_string())
}
/// Pull the rating label (e.g. "Bad", "Excellent") and numeric value
/// from the OG title.
fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
let Some(title) = og(html, "title") else {
/// from the (entity-decoded) `og:title` content.
fn parse_rating_from_og_title(title: Option<&str>) -> (Option<String>, Option<String>) {
let Some(title) = title else {
return (None, None);
};
static RE: OnceLock<Regex> = OnceLock::new();
@ -357,7 +365,7 @@ fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
let re = RE.get_or_init(|| {
Regex::new(r#"is rated\s*[\\"]+([^"\\]+)[\\"]+\s*with\s*([\d.]+)\s*/\s*5"#).unwrap()
});
let Some(caps) = re.captures(&title) else {
let Some(caps) = re.captures(title) else {
return (None, None);
};
(
@ -366,13 +374,13 @@ fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
)
}
/// Parse "hear what 226 customers have already said" from the OG
/// description tag.
fn parse_review_count_from_og_description(html: &str) -> Option<i64> {
let desc = og(html, "description")?;
/// Parse "hear what 226 customers have already said" from the
/// (entity-decoded) `og:description` content.
fn parse_review_count_from_og_description(desc: Option<&str>) -> Option<i64> {
let desc = desc?;
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| Regex::new(r"(\d[\d,]*)\s+customers").unwrap());
re.captures(&desc)?
re.captures(desc)?
.get(1)?
.as_str()
.replace(',', "")
@ -380,29 +388,6 @@ fn parse_review_count_from_og_description(html: &str) -> Option<i64> {
.ok()
}
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
let raw = c.get(2).map(|m| m.as_str())?;
return Some(html_unescape(raw));
}
}
None
}
/// Minimal HTML entity unescaping for the three entities the
/// synthesize_html escaper might produce. Keeps us off a heavier dep.
fn html_unescape(s: &str) -> String {
s.replace("&quot;", "\"")
.replace("&amp;", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
}
fn get_string(v: &Value, key: &str) -> Option<String> {
v.get(key).and_then(|x| x.as_str().map(String::from))
}
@ -488,8 +473,12 @@ mod tests {
#[test]
fn parse_og_title_extracts_name_and_rating() {
let html = r#"<meta property="og:title" content="Anthropic is rated &quot;Bad&quot; with 1.5 / 5 on Trustpilot">"#;
assert_eq!(parse_name_from_og_title(html), Some("Anthropic".into()));
let (label, rating) = parse_rating_from_og_title(html);
let title = parse_og(html).unescaped("title");
assert_eq!(
parse_name_from_og_title(title.as_deref()),
Some("Anthropic".into())
);
let (label, rating) = parse_rating_from_og_title(title.as_deref());
assert_eq!(label.as_deref(), Some("Bad"));
assert_eq!(rating.as_deref(), Some("1.5"));
}
@ -497,7 +486,11 @@ mod tests {
#[test]
fn parse_review_count_from_og_description_picks_number() {
let html = r#"<meta property="og:description" content="Do you agree? Voice your opinion today and hear what 226 customers have already said.">"#;
assert_eq!(parse_review_count_from_og_description(html), Some(226));
let desc = parse_og(html).unescaped("description");
assert_eq!(
parse_review_count_from_og_description(desc.as_deref()),
Some(226)
);
}
#[test]

View file

@ -25,6 +25,7 @@ use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use super::og::parse_og;
use crate::error::FetchError;
use crate::fetcher::Fetcher;
@ -143,9 +144,11 @@ fn build_player_payload(
// ---------------------------------------------------------------------------
fn build_og_fallback(html: &str, url: &str, canonical: &str, video_id: &str) -> Value {
let title = og(html, "title");
let description = og(html, "description");
let thumbnail = og(html, "image");
// Single scan for the three og:* fields read below.
let og_meta = parse_og(html);
let title = og_meta.raw("title");
let description = og_meta.raw("description");
let thumbnail = og_meta.raw("image");
// YouTube sets `<meta name="channel_name" ...>` on some pages but
// OG-only pages reliably carry `og:video:tag` and the channel in
// `<link itemprop="name">`. We keep this lean: just what's stable.
@ -248,19 +251,6 @@ fn extract_player_response(html: &str) -> Option<Value> {
// Meta-tag helpers (for OG fallback)
// ---------------------------------------------------------------------------
fn og(html: &str, prop: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
});
for c in re.captures_iter(html) {
if c.get(1).is_some_and(|m| m.as_str() == prop) {
return c.get(2).map(|m| m.as_str().to_string());
}
}
None
}
fn meta_name(html: &str, name: &str) -> Option<String> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| {

View file

@ -1,13 +1,14 @@
//! Pluggable fetcher abstraction for vertical extractors.
//!
//! Extractors call the network through this trait instead of hard-
//! coding [`FetchClient`]. The OSS CLI / MCP / self-hosted server all
//! pass `&FetchClient` (wreq-backed BoringSSL). The production API
//! server, which must not use in-process TLS fingerprinting, provides
//! its own implementation that routes through the Go tls-sidecar.
//! coding [`FetchClient`]. The CLI / MCP / self-hosted server all pass
//! `&FetchClient`, which fetches in-process via wreq (BoringSSL) with
//! browser-grade TLS fingerprinting. Deployments that need different
//! transport behaviour can supply an alternative [`Fetcher`]
//! implementation instead.
//!
//! Both paths expose the same [`FetchResult`] shape and the same
//! optional cloud-escalation client, so extractor logic stays
//! Every implementation exposes the same [`FetchResult`] shape and the
//! same optional cloud-escalation client, so extractor logic stays
//! identical across environments.
//!
//! ## Choosing an implementation
@ -15,9 +16,9 @@
//! - CLI, MCP, self-hosted `webclaw-server`: build a [`FetchClient`]
//! with [`FetchClient::with_cloud`] to attach cloud fallback, pass
//! it to extractors as `&client`.
//! - `api.webclaw.io` production server: build a `TlsSidecarFetcher`
//! (in `server/src/engine/`) that delegates to `engine::tls_client`
//! and wraps it in `Arc<dyn Fetcher>` for handler injection.
//! - Custom deployments: provide any type implementing [`Fetcher`],
//! wrapped in `Arc<dyn Fetcher>` for handler injection, to layer in
//! environment-specific routing on top of the same extractor logic.
//!
//! ## Why a trait and not a free function
//!

View file

@ -196,38 +196,24 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult>
"linkedin extraction done"
);
Some(ExtractionResult {
metadata: Metadata {
title: if post_author.is_empty() {
None
} else {
Some(format!("{post_author}'s LinkedIn Post"))
},
description: None,
author: if post_author.is_empty() {
None
} else {
Some(post_author)
},
published_date: None,
language: None,
url: Some(url.to_string()),
site_name: Some("LinkedIn".into()),
image: None,
favicon: None,
word_count,
},
content: Content {
markdown,
plain_text: String::new(),
links: vec![],
images: vec![],
code_blocks: vec![],
raw_html: None,
},
domain_data: None,
structured_data: vec![],
})
let title = if post_author.is_empty() {
None
} else {
Some(format!("{post_author}'s LinkedIn Post"))
};
let author = if post_author.is_empty() {
None
} else {
Some(post_author)
};
let metadata = Metadata::default()
.with_title(title)
.with_author(author)
.with_url(Some(url.to_string()))
.with_site_name(Some("LinkedIn".into()))
.with_word_count(word_count);
let content = Content::default().with_markdown(markdown);
Some(ExtractionResult::new(metadata, content))
}
/// Unescape HTML entities (named + numeric decimal).

View file

@ -1,9 +1,9 @@
//! Derive an `Accept-Language` header from a URL.
//!
//! DataDome-class bot detection on country-specific sites (e.g. immobiliare.it,
//! leboncoin.fr) does a geo-vs-locale sanity check: residential IP in the
//! target country + a browser UA but the wrong `Accept-Language` is a bot
//! signal. Matching the site's expected locale gets us through.
//! Some bot-detection systems on country-specific sites do a geo-vs-locale
//! sanity check: an IP in the target country + a browser UA but the wrong
//! `Accept-Language` is a bot signal. Matching the site's expected locale
//! avoids that mismatch.
//!
//! Default for unmapped TLDs is `en-US,en;q=0.9` — the global fallback.
@ -53,15 +53,15 @@ mod tests {
#[test]
fn tld_dispatch() {
assert_eq!(
accept_language_for_url("https://www.immobiliare.it/annunci/1"),
accept_language_for_url("https://www.example.it/page/1"),
Some("it-IT,it;q=0.9")
);
assert_eq!(
accept_language_for_url("https://www.leboncoin.fr/"),
accept_language_for_url("https://www.example.fr/"),
Some("fr-FR,fr;q=0.9")
);
assert_eq!(
accept_language_for_url("https://www.amazon.co.uk/"),
accept_language_for_url("https://www.example.co.uk/"),
Some("en-GB,en;q=0.9")
);
assert_eq!(

View file

@ -597,7 +597,7 @@ mod tests {
"#;
let entries = parse_sitemap_xml(xml);
// Should return at least the successfully parsed entry
assert!(entries.len() >= 1);
assert!(!entries.is_empty());
assert_eq!(entries[0].url, "https://example.com/good");
}

View file

@ -81,10 +81,10 @@ const SAFARI_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkc
/// Safari curves.
const SAFARI_CURVES: &str = "X25519:P-256:P-384:P-521";
/// Safari iOS 26 TLS extension order, matching bogdanfinn's
/// `safari_ios_26_0` wire format. GREASE slots are omitted. wreq
/// inserts them itself. Diverges from wreq-util's default SafariIos26
/// extension order, which DataDome's immobiliare.it ruleset flags.
/// Safari iOS 26 TLS extension order, matching a real Safari iOS 26
/// handshake. GREASE slots are omitted; the TLS layer inserts them
/// itself. Diverges from the library default extension order, which
/// some strict TLS-fingerprinting WAFs flag.
fn safari_ios_extensions() -> Vec<ExtensionType> {
vec![
ExtensionType::CERTIFICATE_TIMESTAMP,
@ -103,12 +103,10 @@ fn safari_ios_extensions() -> Vec<ExtensionType> {
]
}
/// Chrome 133 TLS extension order, matching bogdanfinn's stable JA3
/// (`43067709b025da334de1279a120f8e14`). Real Chrome permutes extensions
/// per handshake, but indeed.com's WAF allowlists this specific wire order
/// and rejects permuted ones. GREASE slots are inserted by wreq.
///
/// JA3 extension field from peet.ws: 18-5-35-51-10-45-11-27-17613-43-13-0-16-65037-65281-23
/// Chrome 133 TLS extension order, matching a real Chrome 133 handshake.
/// Real Chrome permutes extensions per handshake, but some WAFs allowlist
/// one specific real-browser wire order and reject permuted ones. GREASE
/// slots are inserted by the TLS layer.
fn chrome_extensions() -> Vec<ExtensionType> {
vec![
ExtensionType::CERTIFICATE_TIMESTAMP, // 18
@ -220,9 +218,8 @@ const SAFARI_HEADERS: &[(&str, &str)] = &[
/// Safari iOS 26 headers, in the wire order real Safari emits. Critically:
/// NO `sec-fetch-*`, NO `priority: u=0, i` (both Chromium-only leaks), but
/// `upgrade-insecure-requests: 1` is present. `accept-encoding` does not
/// include zstd (Safari can't decode it). Verified against bogdanfinn on
/// 2026-04-22: this header set is what DataDome's immobiliare ruleset
/// expects for a real iPhone.
/// include zstd (Safari can't decode it). This header set matches what a
/// real iPhone emits.
const SAFARI_IOS_HEADERS: &[(&str, &str)] = &[
(
"accept",
@ -264,8 +261,8 @@ const EDGE_HEADERS: &[(&str, &str)] = &[
fn chrome_tls() -> TlsOptions {
// permute_extensions is off so the explicit extension_permutation sticks.
// Real Chrome permutes, but indeed.com's WAF allowlists bogdanfinn's
// fixed order, so matching that gets us through.
// Real Chrome permutes, but some WAFs allowlist one fixed real-browser
// order, so matching that order is what passes.
TlsOptions::builder()
.cipher_list(CHROME_CIPHERS)
.sigalgs_list(CHROME_SIGALGS)
@ -330,18 +327,15 @@ fn safari_tls() -> TlsOptions {
/// Safari iOS 26 emulation — composed on top of `wreq_util::Emulation::SafariIos26`
/// with four targeted overrides. We don't hand-roll this one like Chrome/Firefox
/// because the wire-level defaults from wreq-util are already correct for ciphers,
/// sigalgs, curves, and GREASE — the four things wreq-util gets *wrong* for
/// DataDome compatibility are overridden here:
/// because the wire-level library defaults are already correct for ciphers,
/// sigalgs, curves, and GREASE — the four things the library default gets
/// *wrong* for strict-WAF compatibility are overridden here:
///
/// 1. TLS extension order: match bogdanfinn `safari_ios_26_0` exactly (JA3
/// ends up `8d909525bd5bbb79f133d11cc05159fe`).
/// 1. TLS extension order: match a real Safari iOS 26 handshake exactly.
/// 2. HTTP/2 HEADERS priority flag: weight=256, exclusive=1, depends_on=0.
/// wreq-util omits this frame; real Safari and bogdanfinn include it.
/// This flip is the thing DataDome actually reads — the akamai_fingerprint
/// hash changes from `c52879e43202aeb92740be6e8c86ea96` to
/// `d1294410a06522e37a5c5e3f0a45a705`, which is the winning signature.
/// 3. Headers: strip wreq-util's Chromium defaults (`sec-fetch-*`,
/// The library default omits this frame; real Safari includes it. It is
/// part of the HTTP/2 fingerprint that strict WAFs inspect.
/// 3. Headers: strip the library's Chromium defaults (`sec-fetch-*`,
/// `priority: u=0, i`, zstd), replace with the real iOS 26 set.
/// 4. `accept-language` preserved from config.extra_headers for locale.
fn safari_ios_emulation() -> wreq::Emulation {
@ -354,7 +348,7 @@ fn safari_ios_emulation() -> wreq::Emulation {
// Only override the priority flag — keep wreq-util's SETTINGS, WINDOW_UPDATE,
// and pseudo-order intact. Replacing the whole Http2Options resets SETTINGS
// to defaults, which sends only INITIAL_WINDOW_SIZE and fails DataDome.
// to defaults, which sends only INITIAL_WINDOW_SIZE and fails strict WAFs.
if let Some(h2) = em.http2_options_mut().as_mut() {
h2.headers_stream_dependency = Some(StreamDependency::new(StreamId::zero(), 255, true));
}
@ -374,11 +368,11 @@ fn safari_ios_emulation() -> wreq::Emulation {
}
fn chrome_h2() -> Http2Options {
// SETTINGS frame matches bogdanfinn `chrome_133`: HEADER_TABLE_SIZE,
// SETTINGS frame matches real Chrome 133: HEADER_TABLE_SIZE,
// ENABLE_PUSH=0, INITIAL_WINDOW_SIZE, MAX_HEADER_LIST_SIZE. No
// MAX_CONCURRENT_STREAMS — real Chrome 133 and bogdanfinn both omit it,
// and indeed.com's WAF reads this as a bot signal when present. Priority
// weight 256 (encoded as 255 + 1) matches bogdanfinn's HEADERS frame.
// MAX_CONCURRENT_STREAMS — real Chrome 133 omits it, and some WAFs
// read its presence as a bot signal. Priority weight 256 (encoded as
// 255 + 1) matches a real Chrome HEADERS frame.
Http2Options::builder()
.initial_window_size(6_291_456)
.initial_connection_window_size(15_728_640)
@ -530,7 +524,22 @@ pub fn build_client(
max_redirects as usize,
))
.cookie_store(true)
.timeout(timeout);
.timeout(timeout)
// Fail fast on a black-holed host: a stuck connect aborts in ~5s
// instead of consuming the full request `timeout`. The total
// timeout above still bounds the overall request.
.connect_timeout(Duration::from_secs(5))
// Keep warm connections around for reuse (HTTP/2 multiplexing,
// cookie-warmup retries) but bound idle sockets so a long-lived
// rotating-proxy pool doesn't accumulate dead connections.
.pool_idle_timeout(Duration::from_secs(90))
.pool_max_idle_per_host(8)
// SO_KEEPALIVE so half-open connections through a proxy get torn
// down rather than hanging until the request timeout fires.
.tcp_keepalive(Duration::from_secs(60));
// Note: HTTP/2 keep-alive (PING) interval/timeout are part of the
// emulated fingerprint via `Http2Options` and are intentionally not
// overridden here — changing them would alter the browser fingerprint.
if let Some(proxy_url) = proxy {
let proxy = wreq::Proxy::all(proxy_url).map_err(|_| {

View file

@ -193,7 +193,7 @@ mod tests {
.await
.is_ok()
);
assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false);
assert!(!is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))));
}
#[tokio::test]

View file

@ -71,7 +71,7 @@ fn classify(body: &str, len: usize, status: u16, kw: &[String]) -> &'static str
"CHALLENGE"
} else if status == 403 || status == 429 {
"BLOCKED"
} else if status >= 300 && status < 400 {
} else if (300..400).contains(&status) {
"REDIRECT"
} else if len < 1000 {
"EMPTY"

View file

@ -3,8 +3,12 @@ name = "webclaw-llm"
description = "LLM integration for webclaw — local-first hybrid architecture (Ollama -> OpenAI -> Anthropic)"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
[lints]
workspace = true
[dependencies]
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
async-trait = "0.1"

View file

@ -1,5 +1,6 @@
/// LLM-specific errors. Kept flat — one enum covers transport, provider, and parsing failures.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum LlmError {
#[error("HTTP error: {0}")]
Http(#[from] reqwest::Error),
@ -16,3 +17,53 @@ pub enum LlmError {
#[error("provider error: {0}")]
ProviderError(String),
}
/// Clip an error body (which may come straight off the network) to no more
/// than `max` bytes without ever slicing through a multibyte UTF-8 sequence.
///
/// Text that already fits is returned untouched. Otherwise the cut point
/// starts at `max` and moves backwards one byte at a time until it sits on a
/// char boundary; the returned slice may therefore be shorter than `max`, and
/// is empty when no boundary exists in `1..=max`. Used by every provider
/// error path.
pub(crate) fn truncate_err(text: &str, max: usize) -> &str {
    if max >= text.len() {
        return text;
    }
    let mut cut = max;
    loop {
        // Index 0 is always a valid boundary, so this loop terminates.
        if cut == 0 || text.is_char_boundary(cut) {
            return &text[..cut];
        }
        cut -= 1;
    }
}
#[cfg(test)]
mod tests {
    // Unit tests for `truncate_err`: pass-through when the text fits, plain
    // ASCII truncation, and the char-boundary step-back for multibyte input.
    use super::truncate_err;

    /// Text shorter than the limit is returned as-is.
    #[test]
    fn short_text_unchanged() {
        assert_eq!(truncate_err("hello", 500), "hello");
    }

    /// Text whose byte length equals the limit is not truncated.
    #[test]
    fn exact_length_unchanged() {
        assert_eq!(truncate_err("abcde", 5), "abcde");
    }

    /// ASCII input is cut at exactly `max` bytes.
    #[test]
    fn truncates_ascii() {
        assert_eq!(truncate_err("abcdef", 3), "abc");
    }

    /// A cut that lands inside a multibyte char steps back to the previous
    /// boundary instead of panicking.
    #[test]
    fn never_splits_multibyte() {
        // Byte layout of "aéé": a = 0..1, first "é" = 1..3, second "é" = 3..5.
        let s = "aéé";

        // Cutting at 4 lands mid-char inside the second "é", so the cut must
        // step back to byte 3 (just after the first "é").
        let out = truncate_err(s, 4);
        assert!(s.is_char_boundary(out.len()));
        assert_eq!(out, "aé");

        // Cutting at 3 is already on a boundary: no step-back is needed.
        assert_eq!(truncate_err(s, 3), "aé");
    }

    /// When no boundary exists in `1..=max`, the result is the empty string.
    #[test]
    fn boundary_step_back_to_zero_is_safe() {
        let s = "😀"; // 4 bytes, single char
        assert_eq!(truncate_err(s, 2), "");
    }
}

View file

@ -1,8 +1,31 @@
/// webclaw-llm: LLM integration with local-first hybrid architecture.
///
/// Provider chain tries Ollama (local) first, falls back to OpenAI, then Anthropic.
/// Provides schema-based extraction, prompt extraction, and summarization
/// on top of webclaw-core's content pipeline.
//! webclaw-llm: LLM integration with local-first hybrid architecture.
//!
//! Provider chain tries Ollama (local) first, falls back to OpenAI, then Anthropic.
//! Provides schema-based extraction, prompt extraction, and summarization
//! on top of webclaw-core's content pipeline.
//!
//! ```no_run
//! use webclaw_llm::{ProviderChain, LlmProvider, CompletionRequest, Message};
//!
//! # async fn run() -> Result<(), webclaw_llm::LlmError> {
//! // Builds Ollama -> OpenAI -> Anthropic, including only configured providers.
//! let chain = ProviderChain::default().await;
//!
//! let request = CompletionRequest {
//! model: String::new(), // empty = each provider's default model
//! messages: vec![Message { role: "user".into(), content: "Hello".into() }],
//! temperature: None,
//! max_tokens: None,
//! json_mode: false,
//! };
//!
//! let answer = chain.complete(&request).await?;
//! println!("{answer}");
//! # Ok(())
//! # }
//! ```
#![deny(unsafe_code)]
pub mod chain;
pub mod clean;
pub mod error;

View file

@ -1,14 +1,16 @@
/// Anthropic provider — Claude models via api.anthropic.com.
/// Anthropic's API differs from OpenAI: system message is a top-level param,
/// not part of the messages array.
use std::time::Duration;
use async_trait::async_trait;
use serde_json::json;
use crate::clean::strip_thinking_tags;
use crate::error::LlmError;
use crate::error::{LlmError, truncate_err};
use crate::provider::{CompletionRequest, LlmProvider};
use super::load_api_key;
use super::{build_http_client, load_api_key};
const DEFAULT_ANTHROPIC_BASE_URL: &str = "https://api.anthropic.com/v1";
const ANTHROPIC_VERSION: &str = "2023-06-01";
@ -35,7 +37,7 @@ impl AnthropicProvider {
let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?;
Some(Self {
client: reqwest::Client::new(),
client: build_http_client(Duration::from_secs(120)),
key,
base_url: base_url
.or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok())
@ -108,11 +110,7 @@ impl LlmProvider for AnthropicProvider {
if !resp.status().is_success() {
let status = resp.status();
let text = resp.text().await.unwrap_or_default();
let safe_text = if text.len() > 500 {
&text[..500]
} else {
&text
};
let safe_text = truncate_err(&text, 500);
return Err(LlmError::ProviderError(format!(
"anthropic returned {status}: {safe_text}"
)));
@ -208,12 +206,17 @@ mod tests {
);
}
// Env var fallback tests mutate process-global state and race with parallel tests.
// The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
// cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
// ANTHROPIC_API_KEY is process-global; cargo runs tests in parallel
// threads. Serialize the env-mutating tests so setting the key in one
// can't race another asserting its absence (poison-tolerant).
static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
#[test]
#[ignore = "mutates process env; run with --test-threads=1"]
#[allow(unsafe_code)] // test-only env mutation, serialized by ENV_LOCK
fn env_var_key_fallback() {
let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
// SAFETY: env mutation is serialized by ENV_LOCK; set_var/remove_var
// are `unsafe` functions as of the Rust 2024 edition.
unsafe { std::env::set_var("ANTHROPIC_API_KEY", "sk-ant-env") };
let provider = AnthropicProvider::new(None, None).expect("should construct from env");
assert_eq!(provider.key, "sk-ant-env");
@ -221,8 +224,11 @@ mod tests {
}
#[test]
#[ignore = "mutates process env; run with --test-threads=1"]
#[allow(unsafe_code)] // test-only env mutation, serialized by ENV_LOCK
fn no_key_returns_none() {
let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
// SAFETY: env mutation is serialized by ENV_LOCK. Clear any ambient
// runner value so the absence assertion is deterministic.
unsafe { std::env::remove_var("ANTHROPIC_API_KEY") };
assert!(AnthropicProvider::new(None, None).is_none());
}

View file

@ -2,8 +2,26 @@ pub mod anthropic;
pub mod ollama;
pub mod openai;
use std::time::Duration;
use crate::error::LlmError;
/// Connect timeout shared by every provider. A dead or wrong host should fail
/// fast (so the chain can move to the next provider) rather than hang on the
/// OS default connect timeout.
pub(crate) const CONNECT_TIMEOUT: Duration = Duration::from_secs(3);
/// Construct a provider's HTTP client: the shared [`CONNECT_TIMEOUT`] bounds
/// connection establishment and `request_timeout` bounds the whole request.
///
/// If the builder returns an error, a plain `reqwest::Client::new()` is used
/// instead, so this function never returns an error. That fallback client
/// carries neither timeout.
///
/// NOTE(review): `reqwest::Client::new()` is documented to panic when the TLS
/// backend or resolver cannot be initialised, so the fallback may not be truly
/// infallible — confirm.
pub(crate) fn build_http_client(request_timeout: Duration) -> reqwest::Client {
    let configured = reqwest::Client::builder()
        .connect_timeout(CONNECT_TIMEOUT)
        .timeout(request_timeout)
        .build();

    match configured {
        Ok(client) => client,
        Err(_) => reqwest::Client::new(),
    }
}
/// Load an API key from an explicit override or an environment variable.
/// Returns `None` if neither is set or the value is empty.
pub(crate) fn load_api_key(override_key: Option<String>, env_var: &str) -> Option<String> {

View file

@ -1,12 +1,16 @@
/// Ollama provider — talks to a local Ollama instance (default localhost:11434).
/// First choice in the provider chain: free, private, fast on Apple Silicon.
use std::time::Duration;
use async_trait::async_trait;
use serde_json::json;
use crate::clean::strip_thinking_tags;
use crate::error::LlmError;
use crate::error::{LlmError, truncate_err};
use crate::provider::{CompletionRequest, LlmProvider};
use super::build_http_client;
pub struct OllamaProvider {
client: reqwest::Client,
base_url: String,
@ -23,8 +27,11 @@ impl OllamaProvider {
.or_else(|| std::env::var("OLLAMA_MODEL").ok())
.unwrap_or_else(|| "qwen3:8b".into());
// Ollama runs local models that can take a while to generate; keep the
// overall timeout generous, but cap connect time so an unreachable host
// fails fast and the chain can fall through to a cloud provider.
Self {
client: reqwest::Client::new(),
client: build_http_client(Duration::from_secs(120)),
base_url,
default_model,
}
@ -70,11 +77,7 @@ impl LlmProvider for OllamaProvider {
if !resp.status().is_success() {
let status = resp.status();
let text = resp.text().await.unwrap_or_default();
let safe_text = if text.len() > 500 {
&text[..500]
} else {
&text
};
let safe_text = truncate_err(&text, 500);
return Err(LlmError::ProviderError(format!(
"ollama returned {status}: {safe_text}"
)));
@ -140,12 +143,17 @@ mod tests {
assert_eq!(provider.default_model(), "phi3:mini");
}
// Env var fallback is a trivial `env::var().ok()` -- not worth the flakiness
// of manipulating process-global state. Run in isolation if needed:
// cargo test -p webclaw-llm env_var_fallback -- --ignored --test-threads=1
// OLLAMA_HOST / OLLAMA_MODEL are process-global; cargo runs tests in
// parallel threads. Serialize the env-mutating tests so one that sets a
// var can't race another asserting its absence (poison-tolerant).
static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
#[test]
#[ignore = "mutates process env; run with --test-threads=1"]
#[allow(unsafe_code)] // test-only env mutation, serialized by ENV_LOCK
fn env_var_fallback() {
let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
// SAFETY: env mutation is serialized by ENV_LOCK; set_var/remove_var
// are `unsafe` functions as of the Rust 2024 edition.
unsafe {
std::env::set_var("OLLAMA_HOST", "http://remote:11434");
std::env::set_var("OLLAMA_MODEL", "mistral:7b");

View file

@ -1,12 +1,14 @@
/// OpenAI provider — works with api.openai.com and any OpenAI-compatible endpoint.
use std::time::Duration;
use async_trait::async_trait;
use serde_json::json;
use crate::clean::strip_thinking_tags;
use crate::error::LlmError;
use crate::error::{LlmError, truncate_err};
use crate::provider::{CompletionRequest, LlmProvider};
use super::load_api_key;
use super::{build_http_client, load_api_key};
pub struct OpenAiProvider {
client: reqwest::Client,
@ -69,7 +71,7 @@ impl OpenAiProvider {
let key = load_api_key(key_override, "OPENAI_API_KEY")?;
Some(Self {
client: reqwest::Client::new(),
client: build_http_client(Duration::from_secs(120)),
key,
base_url: base_url
.or_else(|| std::env::var("OPENAI_BASE_URL").ok())
@ -132,11 +134,7 @@ impl LlmProvider for OpenAiProvider {
if !resp.status().is_success() {
let status = resp.status();
let text = resp.text().await.unwrap_or_default();
let safe_text = if text.len() > 500 {
&text[..500]
} else {
&text
};
let safe_text = truncate_err(&text, 500);
return Err(LlmError::ProviderError(format!(
"openai returned {status}: {safe_text}"
)));
@ -276,12 +274,17 @@ mod tests {
assert_eq!(body["response_format"], json!({ "type": "text" }));
}
// Env var fallback tests mutate process-global state and race with parallel tests.
// The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
// cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
// OPENAI_API_KEY is process-global; cargo runs tests in parallel threads.
// Serialize the env-mutating tests so setting the key in one can't race
// another asserting its absence (poison-tolerant).
static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
#[test]
#[ignore = "mutates process env; run with --test-threads=1"]
#[allow(unsafe_code)] // test-only env mutation, serialized by ENV_LOCK
fn env_var_key_fallback() {
let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
// SAFETY: env mutation is serialized by ENV_LOCK; set_var/remove_var
// are `unsafe` functions as of the Rust 2024 edition.
unsafe { std::env::set_var("OPENAI_API_KEY", "sk-env-key") };
let provider = OpenAiProvider::new(None, None, None).expect("should construct from env");
assert_eq!(provider.key, "sk-env-key");
@ -289,8 +292,11 @@ mod tests {
}
#[test]
#[ignore = "mutates process env; run with --test-threads=1"]
#[allow(unsafe_code)] // test-only env mutation, serialized by ENV_LOCK
fn no_key_returns_none() {
let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
// SAFETY: env mutation is serialized by ENV_LOCK. Clear any ambient
// runner value so the absence assertion is deterministic.
unsafe { std::env::remove_var("OPENAI_API_KEY") };
assert!(OpenAiProvider::new(None, None, None).is_none());
}

View file

@ -3,8 +3,12 @@ name = "webclaw-mcp"
description = "MCP server for webclaw web extraction toolkit"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
[lints]
workspace = true
[[bin]]
name = "webclaw-mcp"
path = "src/main.rs"

View file

@ -498,30 +498,13 @@ impl WebclawMcp {
);
}
let current = webclaw_core::ExtractionResult {
content: webclaw_core::Content {
markdown: markdown.to_string(),
plain_text: markdown.to_string(),
links: Vec::new(),
images: Vec::new(),
code_blocks: Vec::new(),
raw_html: None,
},
metadata: webclaw_core::Metadata {
title: None,
description: None,
author: None,
published_date: None,
language: None,
url: Some(params.url.clone()),
site_name: None,
image: None,
favicon: None,
word_count: markdown.split_whitespace().count(),
},
domain_data: None,
structured_data: Vec::new(),
};
let content = webclaw_core::Content::default()
.with_markdown(markdown.to_string())
.with_plain_text(markdown.to_string());
let metadata = webclaw_core::Metadata::default()
.with_url(Some(params.url.clone()))
.with_word_count(markdown.split_whitespace().count());
let current = webclaw_core::ExtractionResult::new(metadata, content);
let content_diff = webclaw_core::diff::diff(&previous, &current);
Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default())

View file

@ -3,8 +3,12 @@ name = "webclaw-pdf"
description = "PDF text extraction for webclaw"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
[lints]
workspace = true
[dependencies]
pdf-extract = "0.7"
thiserror = { workspace = true }

View file

@ -1,7 +1,9 @@
/// PDF text extraction for webclaw.
///
/// Uses pdf-extract (backed by lopdf) to pull text from PDF bytes.
/// No OCR -- text-based PDFs only. Scanned PDFs return EmptyPdf in Auto mode.
//! PDF text extraction for webclaw.
//!
//! Uses pdf-extract (backed by lopdf) to pull text from PDF bytes.
//! No OCR -- text-based PDFs only. Scanned PDFs return EmptyPdf in Auto mode.
#![forbid(unsafe_code)]
pub mod error;
pub use error::PdfError;
@ -64,9 +66,18 @@ pub fn extract_pdf(bytes: &[u8], mode: PdfMode) -> Result<PdfResult, PdfError> {
debug!(pages = page_count, "PDF document loaded");
// Extract text via pdf-extract (higher-level API over lopdf)
let text = pdf_extract::extract_text_from_mem(bytes)
.map_err(|e| PdfError::ExtractionFailed(e.to_string()))?;
// Extract text via pdf-extract (higher-level API over lopdf).
// pdf-extract has bare `panic!`/`unreachable!` sites on malformed input,
// so we isolate it in catch_unwind: a caught panic becomes a normal
// ExtractionFailed error instead of unwinding through our callers.
// AssertUnwindSafe is sound here: the closure only borrows `bytes` (a
// read-only slice) and we discard all closure state on a caught panic.
let extracted = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
pdf_extract::extract_text_from_mem(bytes)
}))
.map_err(|_| PdfError::ExtractionFailed("pdf-extract panicked on malformed input".into()))?;
let text = extracted.map_err(|e| PdfError::ExtractionFailed(e.to_string()))?;
let text = normalize_text(&text);

View file

@ -2,10 +2,14 @@
name = "webclaw-server"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
repository.workspace = true
description = "Minimal REST API server for self-hosting webclaw extraction. Wraps the OSS extraction crates with HTTP endpoints. NOT the production hosted API at api.webclaw.io — this is a stateless, single-binary reference server for local + self-hosted deployments."
[lints]
workspace = true
[[bin]]
name = "webclaw-server"
path = "src/main.rs"
@ -18,7 +22,7 @@ webclaw-pdf = { workspace = true }
axum = { version = "0.8", features = ["macros"] }
tokio = { workspace = true }
tower-http = { version = "0.6", features = ["trace", "cors"] }
tower-http = { version = "0.6", features = ["trace", "cors", "timeout"] }
clap = { workspace = true, features = ["derive", "env"] }
serde = { workspace = true }
serde_json = { workspace = true }
@ -27,3 +31,9 @@ tracing-subscriber = { workspace = true, features = ["env-filter"] }
anyhow = "1"
thiserror = { workspace = true }
subtle = "2.6"
[dev-dependencies]
# `ServiceExt::oneshot` drives the router in-process for hermetic handler
# tests (no TCP listener, no network).
tower = { version = "0.5", features = ["util"] }
http-body-util = "0.1"

View file

@ -26,12 +26,20 @@ use axum::{
};
use clap::Parser;
use tower_http::cors::{Any, CorsLayer};
use tower_http::timeout::TimeoutLayer;
use tower_http::trace::TraceLayer;
use tracing::info;
use tracing_subscriber::{EnvFilter, fmt};
use crate::state::AppState;
/// Hard ceiling on how long any single request may run before the server
/// returns `408 Request Timeout` and drops the work. Generous enough for a
/// cold scrape + LLM round-trip, but bounds the inline `/v1/crawl` handler
/// (up to 500 pages, no job queue) so a slow crawl can't pin a connection
/// and a worker indefinitely.
const REQUEST_TIMEOUT: Duration = Duration::from_secs(120);
#[derive(Parser, Debug)]
#[command(
name = "webclaw-server",
@ -84,8 +92,29 @@ async fn main() -> anyhow::Result<()> {
);
}
let state = AppState::new(args.api_key.clone())?;
let state = AppState::new(args.api_key.clone()).await?;
let app = build_app(state);
let addr = SocketAddr::from((args.host, args.port));
let listener = tokio::net::TcpListener::bind(addr).await?;
let auth_status = if args.api_key.is_some() {
"bearer auth required"
} else {
"open mode (no auth)"
};
info!(%addr, mode = auth_status, "webclaw-server listening");
axum::serve(listener, app).await?;
Ok(())
}
/// Build the fully-layered axum router for a given [`AppState`].
///
/// Split out from `main` so the handler tests can exercise the exact same
/// routing + middleware stack (auth, timeout) in-process via
/// `tower::ServiceExt::oneshot`, with no TCP listener.
fn build_app(state: AppState) -> Router {
let v1 = Router::new()
.route("/scrape", post(routes::scrape::scrape))
.route(
@ -102,7 +131,7 @@ async fn main() -> anyhow::Result<()> {
.route("/brand", post(routes::brand::brand))
.layer(from_fn_with_state(state.clone(), auth::require_bearer));
let app = Router::new()
Router::new()
.route("/health", get(routes::health::health))
.nest("/v1", v1)
.layer(
@ -115,20 +144,14 @@ async fn main() -> anyhow::Result<()> {
.allow_headers(Any)
.max_age(Duration::from_secs(3600)),
)
// Caps total request time; returns 408 if exceeded. Layered outside the
// routes and CORS (only the trace layer added below wraps it), so it
// covers every route, including the inline crawl.
.layer(TimeoutLayer::with_status_code(
axum::http::StatusCode::REQUEST_TIMEOUT,
REQUEST_TIMEOUT,
))
.layer(TraceLayer::new_for_http())
.with_state(state);
let addr = SocketAddr::from((args.host, args.port));
let listener = tokio::net::TcpListener::bind(addr).await?;
let auth_status = if args.api_key.is_some() {
"bearer auth required"
} else {
"open mode (no auth)"
};
info!(%addr, mode = auth_status, "webclaw-server listening");
axum::serve(listener, app).await?;
Ok(())
.with_state(state)
}
fn is_unspecified_addr(addr: IpAddr) -> bool {
@ -137,3 +160,133 @@ fn is_unspecified_addr(addr: IpAddr) -> bool {
IpAddr::V6(ip) => ip.is_unspecified(),
}
}
#[cfg(test)]
mod tests {
//! Hermetic handler tests. Each builds the real router via
//! [`build_app`] and drives it in-process with
//! [`tower::ServiceExt::oneshot`] — no TCP listener, no outbound
//! network. Endpoints that would fetch a URL are reached only on paths
//! that short-circuit before any network call (auth rejection, format
//! validation, the static `/v1/extractors` catalog, `/health`).
use super::*;
use axum::body::Body;
use axum::http::{Request, StatusCode};
use http_body_util::BodyExt;
use tower::ServiceExt;
/// API key used by the tests that run the server in bearer-auth mode.
const TEST_KEY: &str = "test-secret-key";
/// Build the full router for a test. `Some(key)` configures that API key;
/// `None` leaves the server in open mode. Panics if state construction fails.
async fn app_with_key(key: Option<&str>) -> Router {
// `AppState::new` probes Ollama once at startup. With no Ollama
// running the probe returns fast (connection refused) and the
// tests below never touch the chain, so they stay hermetic either
// way — no env juggling required.
let state = AppState::new(key.map(str::to_owned))
.await
.expect("build state");
build_app(state)
}
/// Request for `uri` with an empty body and no headers. No method is set, so
/// the builder's default applies (GET in the `http` crate).
fn get(uri: &str) -> Request<Body> {
Request::builder()
.uri(uri)
.body(Body::empty())
.expect("request")
}
/// Same as [`get`], plus an `authorization` header set to `header` verbatim
/// (the caller supplies the full value, scheme included).
fn get_auth(uri: &str, header: &str) -> Request<Body> {
Request::builder()
.uri(uri)
.header("authorization", header)
.body(Body::empty())
.expect("request")
}
/// Collect the whole response body and parse it as JSON. Panics if the body
/// cannot be read or is not valid JSON.
async fn json_body(resp: axum::response::Response) -> serde_json::Value {
let bytes = resp.into_body().collect().await.expect("body").to_bytes();
serde_json::from_slice(&bytes).expect("json")
}
/// `/health` answers 200 with the status, service name, and crate version.
#[tokio::test]
async fn health_returns_version() {
let app = app_with_key(None).await;
let resp = app.oneshot(get("/health")).await.expect("response");
assert_eq!(resp.status(), StatusCode::OK);
let body = json_body(resp).await;
assert_eq!(body["status"], "ok");
assert_eq!(body["service"], "webclaw-server");
assert_eq!(body["version"], env!("CARGO_PKG_VERSION"));
}
/// With a key configured, a `/v1` request without credentials gets 401.
#[tokio::test]
async fn missing_key_is_unauthorized() {
let app = app_with_key(Some(TEST_KEY)).await;
let resp = app.oneshot(get("/v1/extractors")).await.expect("response");
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
}
/// With a key configured, a bearer token that does not match gets 401.
#[tokio::test]
async fn wrong_key_is_unauthorized() {
let app = app_with_key(Some(TEST_KEY)).await;
let resp = app
.oneshot(get_auth("/v1/extractors", "Bearer wrong-key"))
.await
.expect("response");
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
}
/// The configured key presented as `Bearer <key>` passes auth.
#[tokio::test]
async fn correct_key_authorized() {
let app = app_with_key(Some(TEST_KEY)).await;
// `/v1/extractors` is a static catalog — passes auth, no network.
let resp = app
.oneshot(get_auth("/v1/extractors", &format!("Bearer {TEST_KEY}")))
.await
.expect("response");
assert_eq!(resp.status(), StatusCode::OK);
}
/// The auth scheme is accepted in lowercase (`bearer <key>`) as well.
#[tokio::test]
async fn lowercase_bearer_accepted() {
let app = app_with_key(Some(TEST_KEY)).await;
let resp = app
.oneshot(get_auth("/v1/extractors", &format!("bearer {TEST_KEY}")))
.await
.expect("response");
assert_eq!(resp.status(), StatusCode::OK);
}
/// Without a configured key, `/v1` routes are reachable with no credentials.
#[tokio::test]
async fn open_mode_allows_unauthenticated() {
// No api key configured => auth middleware passes everything.
let app = app_with_key(None).await;
let resp = app.oneshot(get("/v1/extractors")).await.expect("response");
assert_eq!(resp.status(), StatusCode::OK);
}
/// `POST /v1/scrape` with an unrecognised format answers 400 and an error
/// message that mentions "unknown format".
#[tokio::test]
async fn unknown_format_is_bad_request() {
// Format validation now runs before the fetch, so a bogus format
// returns 400 without any network call.
let app = app_with_key(None).await;
let req = Request::builder()
.method("POST")
.uri("/v1/scrape")
.header("content-type", "application/json")
.body(Body::from(
r#"{"url":"https://example.com","formats":["bogus"]}"#,
))
.expect("request");
let resp = app.oneshot(req).await.expect("response");
assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
let body = json_body(resp).await;
assert!(
body["error"]
.as_str()
.is_some_and(|e| e.contains("unknown format")),
"expected unknown-format error, got {body:?}"
);
}
}

View file

@ -9,7 +9,7 @@ use axum::{Json, extract::State};
use serde::Deserialize;
use serde_json::{Value, json};
use std::time::Duration;
use webclaw_fetch::{CrawlConfig, Crawler, FetchConfig};
use webclaw_fetch::{CrawlConfig, Crawler};
use crate::{error::ApiError, state::AppState};
@ -30,7 +30,7 @@ pub struct CrawlRequest {
}
pub async fn crawl(
State(_state): State<AppState>,
State(state): State<AppState>,
Json(req): Json<CrawlRequest>,
) -> Result<Json<Value>, ApiError> {
if req.url.trim().is_empty() {
@ -42,7 +42,10 @@ pub async fn crawl(
let concurrency = req.concurrency.unwrap_or(5).min(20);
let config = CrawlConfig {
fetch: FetchConfig::default(),
// Inherit the shared client's profile/proxy/timeout instead of
// `FetchConfig::default()` (which is Chrome). The rest of the
// server fetches as Firefox; crawl now matches.
fetch: state.fetch_config().clone(),
max_depth,
max_pages,
concurrency,

View file

@ -36,36 +36,16 @@ impl PreviousSnapshot {
fn into_extraction(self) -> ExtractionResult {
match self {
Self::Full(r) => r,
Self::Minimal { markdown, metadata } => ExtractionResult {
metadata: metadata.unwrap_or_else(empty_metadata),
content: Content {
markdown,
plain_text: String::new(),
links: Vec::new(),
images: Vec::new(),
code_blocks: Vec::new(),
raw_html: None,
},
domain_data: None,
structured_data: Vec::new(),
},
Self::Minimal { markdown, metadata } => ExtractionResult::new(
metadata.unwrap_or_else(empty_metadata),
Content::default().with_markdown(markdown),
),
}
}
}
fn empty_metadata() -> Metadata {
Metadata {
title: None,
description: None,
author: None,
published_date: None,
language: None,
url: None,
site_name: None,
image: None,
favicon: None,
word_count: 0,
}
Metadata::default()
}
pub async fn diff_route(

View file

@ -4,14 +4,14 @@
//! * `schema` — JSON Schema describing what to extract.
//! * `prompt` — natural-language instructions.
//!
//! At least one must be provided. The provider chain is built per
//! request from env (Ollama -> OpenAI -> Anthropic). Self-hosters
//! get the same fallback behaviour as the CLI.
//! At least one must be provided. The provider chain (Ollama -> OpenAI
//! -> Anthropic) is built once at startup and shared via `AppState`.
//! Self-hosters get the same fallback behaviour as the CLI.
use axum::{Json, extract::State};
use serde::Deserialize;
use serde_json::{Value, json};
use webclaw_llm::{ProviderChain, extract::extract_json, extract::extract_with_prompt};
use webclaw_llm::{extract::extract_json, extract::extract_with_prompt};
use crate::{error::ApiError, state::AppState};
@ -59,7 +59,7 @@ pub async fn extract(
));
}
let chain = ProviderChain::default().await;
let chain = state.llm_chain();
if chain.is_empty() {
return Err(ApiError::Llm(
"no LLM providers configured (set OLLAMA_HOST, OPENAI_API_KEY, or ANTHROPIC_API_KEY)"
@ -69,10 +69,10 @@ pub async fn extract(
let model = req.model.as_deref();
let data = if let Some(schema) = req.schema.as_ref() {
extract_json(&content, schema, &chain, model).await?
extract_json(&content, schema, chain, model).await?
} else {
let prompt = req.prompt.as_deref().unwrap_or_default();
extract_with_prompt(&content, prompt, &chain, model).await?
extract_with_prompt(&content, prompt, chain, model).await?
};
Ok(Json(json!({

View file

@ -52,8 +52,18 @@ pub async fn scrape(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
let formats = req.formats.as_vec();
// Validate requested formats up front so a typo fails fast with a 400
// instead of after a full (wasted) fetch + extract.
if let Some(bad) = formats
.iter()
.find(|f| !matches!(f.as_str(), "markdown" | "text" | "llm" | "html" | "json"))
{
return Err(ApiError::bad_request(format!(
"unknown format: '{bad}' (allowed: markdown, text, llm, html, json)"
)));
}
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
let options = ExtractionOptions {
include_selectors: req.include_selectors,

View file

@ -3,7 +3,7 @@
use axum::{Json, extract::State};
use serde::Deserialize;
use serde_json::{Value, json};
use webclaw_llm::{ProviderChain, summarize::summarize};
use webclaw_llm::summarize::summarize;
use crate::{error::ApiError, state::AppState};
@ -36,7 +36,7 @@ pub async fn summarize_route(
));
}
let chain = ProviderChain::default().await;
let chain = state.llm_chain();
if chain.is_empty() {
return Err(ApiError::Llm(
"no LLM providers configured (set OLLAMA_HOST, OPENAI_API_KEY, or ANTHROPIC_API_KEY)"
@ -44,7 +44,7 @@ pub async fn summarize_route(
));
}
let summary = summarize(&content, req.max_sentences, &chain, req.model.as_deref()).await?;
let summary = summarize(&content, req.max_sentences, chain, req.model.as_deref()).await?;
Ok(Json(json!({
"url": req.url,

View file

@ -20,6 +20,7 @@ use std::sync::Arc;
use tracing::info;
use webclaw_fetch::cloud::CloudClient;
use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig};
use webclaw_llm::ProviderChain;
/// Single-process state shared across all request handlers.
#[derive(Clone)]
@ -34,6 +35,16 @@ struct Inner {
/// auto-deref `&Arc<FetchClient>` -> `&FetchClient`, so this costs
/// them nothing.
pub fetch: Arc<FetchClient>,
/// The exact [`FetchConfig`] the shared `fetch` client was built from.
/// Endpoints that spin up their own client (e.g. `/v1/crawl`, which
/// builds a `Crawler` with its own internal `FetchClient`) clone this
/// so they inherit the same browser profile / proxy / timeout instead
/// of silently falling back to `FetchConfig::default()` (Chrome).
pub fetch_config: FetchConfig,
/// LLM provider chain (Ollama -> OpenAI -> Anthropic), built once at
/// startup. `/v1/extract` and `/v1/summarize` borrow this instead of
/// rebuilding the chain (and re-probing Ollama) on every request.
pub llm_chain: Arc<ProviderChain>,
/// Inbound bearer-auth token for this server's own `/v1/*` surface.
pub api_key: Option<String>,
}
@ -45,12 +56,15 @@ impl AppState {
///
/// `inbound_api_key` is the bearer token clients must present;
/// cloud-fallback credentials come from the env (checked here).
pub fn new(inbound_api_key: Option<String>) -> anyhow::Result<Self> {
///
/// Async because the LLM provider chain probes Ollama for availability
/// once at startup; doing it here keeps it off the per-request hot path.
pub async fn new(inbound_api_key: Option<String>) -> anyhow::Result<Self> {
let config = FetchConfig {
browser: BrowserProfile::Firefox,
..FetchConfig::default()
};
let mut fetch = FetchClient::new(config)
let mut fetch = FetchClient::new(config.clone())
.map_err(|e| anyhow::anyhow!("failed to build fetch client: {e}"))?;
// Cloud fallback: only activates when the operator has provided
@ -66,9 +80,13 @@ impl AppState {
fetch = fetch.with_cloud(cloud);
}
let llm_chain = Arc::new(ProviderChain::default().await);
Ok(Self {
inner: Arc::new(Inner {
fetch: Arc::new(fetch),
fetch_config: config,
llm_chain,
api_key: inbound_api_key,
}),
})
@ -78,6 +96,19 @@ impl AppState {
&self.inner.fetch
}
/// The [`FetchConfig`] the shared client was built from. Cloned by
/// endpoints that need to construct their own client with identical
/// settings (currently `/v1/crawl`).
pub fn fetch_config(&self) -> &FetchConfig {
&self.inner.fetch_config
}
/// The shared LLM provider chain. Borrowed by `/v1/extract` and
/// `/v1/summarize`; `&ProviderChain` coerces to `&dyn LlmProvider`.
pub fn llm_chain(&self) -> &ProviderChain {
&self.inner.llm_chain
}
/// The inbound bearer-auth token for this server's own `/v1/*` surface,
/// or `None` when no token was configured at construction time.
pub fn api_key(&self) -> Option<&str> {
self.inner.api_key.as_deref()
}

59
deny.toml Normal file
View file

@ -0,0 +1,59 @@
# cargo-deny configuration — supply-chain gate for the webclaw workspace.
# Run locally with `cargo deny check`; CI runs it via EmbarkStudios/cargo-deny-action.
#
# Scope of enforcement:
# advisories — fail on known RUSTSEC vulnerabilities / unmaintained crates
# bans — keep the dep tree lean and free of disallowed crates
# licenses — allow the AGPL-3.0 workspace plus permissive deps only
# sources — only crates.io and our own GitHub org
[graph]
# Resolve the graph with every feature enabled so a vuln gated behind an
# optional feature still trips the gate. No `targets` list is set, so crates for
# all platforms are checked; add one only to narrow to the platforms we ship.
all-features = true
[advisories]
version = 2
# Fail the build on any unfixed advisory by default (cargo-deny v2 errors on
# `vulnerability`/`unmaintained`/`unsound` unless explicitly ignored; `yanked`
# crates only warn unless `yanked = "deny"` is set).
# Add specific RUSTSEC ids here with a justification only when a fix is not yet
# available upstream.
ignore = []
[bans]
# Warn (don't hard-fail) on duplicate versions of the same crate — common and
# usually benign in a tree this size; revisit if a duplicate becomes a problem.
multiple-versions = "warn"
wildcard-dependencies = "deny"
# Crates that must never enter the tree. Empty for now; this is where a banned
# transitive dep (e.g. an unmaintained TLS or crypto crate) would be listed.
deny = []
[licenses]
version = 2
# Permissive licenses we accept on dependencies, plus AGPL-3.0 for the
# workspace crates themselves. SPDX identifiers.
allow = [
"AGPL-3.0",
"MIT",
"Apache-2.0",
"Apache-2.0 WITH LLVM-exception",
"BSD-2-Clause",
"BSD-3-Clause",
"MPL-2.0",
"ISC",
"Unicode-3.0",
"Unicode-DFS-2016",
"Zlib",
"CC0-1.0",
]
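# Crates with no SPDX expression in their manifest are identified from their
# license files; a detection below this confidence fails unless the crate is
# clarified with a `[[licenses.clarify]]` entry.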
confidence-threshold = 0.8
[sources]
unknown-registry = "deny"
unknown-git = "deny"
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
[sources.allow-org]
github = ["0xMassi"]