From eede2f695374cc84775e378de13418b6decb0752 Mon Sep 17 00:00:00 2001 From: Valerio Date: Mon, 4 May 2026 12:08:11 +0200 Subject: [PATCH 01/49] docs: credit SSRF report --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0cc9ca..afec609 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). ## [0.5.7] — 2026-04-30 +### Security +- Hardened server-side URL fetching against SSRF by rejecting private/internal IP ranges and unsafe redirect targets across CLI, MCP, and the self-hosted REST server. Thanks to KairoKid / dodge1218 (vonbrubeck@gmail.com) for the responsible report. + ### Docs - README header now uses an `

webclaw

` instead of an `

` slogan. The repo had no heading-level brand anchor before, only a banner image, so search engines indexing the README were missing the canonical brand signal. The new heading is what GitHub renders as the title of the page and what Google co-ranks with webclaw.io. From 1c9def2fdeec7de26d50244431502c81731db7fa Mon Sep 17 00:00:00 2001 From: Valerio Date: Mon, 4 May 2026 14:30:06 +0200 Subject: [PATCH 02/49] fix: validate self-host route URLs consistently --- crates/webclaw-server/src/error.rs | 11 ++++++++++- crates/webclaw-server/src/routes/brand.rs | 3 ++- crates/webclaw-server/src/routes/crawl.rs | 5 +++-- crates/webclaw-server/src/routes/diff.rs | 3 ++- crates/webclaw-server/src/routes/extract.rs | 3 ++- crates/webclaw-server/src/routes/map.rs | 3 ++- crates/webclaw-server/src/routes/structured.rs | 5 +++-- crates/webclaw-server/src/routes/summarize.rs | 3 ++- 8 files changed, 26 insertions(+), 10 deletions(-) diff --git a/crates/webclaw-server/src/error.rs b/crates/webclaw-server/src/error.rs index 7f1d36e..a63848f 100644 --- a/crates/webclaw-server/src/error.rs +++ b/crates/webclaw-server/src/error.rs @@ -74,7 +74,16 @@ impl From for ApiError { webclaw_fetch::FetchError::InvalidUrl(msg) => { Self::BadRequest(format!("invalid url: {msg}")) } - other => Self::Fetch(other.to_string()), + other => { + let msg = other.to_string(); + if msg.contains("invalid url:") + || msg.contains("blocked private or internal address") + { + Self::BadRequest(msg) + } else { + Self::Fetch(msg) + } + } } } } diff --git a/crates/webclaw-server/src/routes/brand.rs b/crates/webclaw-server/src/routes/brand.rs index 908976a..f3f6a43 100644 --- a/crates/webclaw-server/src/routes/brand.rs +++ b/crates/webclaw-server/src/routes/brand.rs @@ -21,8 +21,9 @@ pub async fn brand( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; - let fetched = state.fetch().fetch(&req.url).await?; + let fetched = state.fetch().fetch(url.as_str()).await?; let brand = extract_brand(&fetched.html, Some(&fetched.url)); Ok(Json(json!({ diff --git a/crates/webclaw-server/src/routes/crawl.rs b/crates/webclaw-server/src/routes/crawl.rs index 4d15195..9ea484c 100644 --- a/crates/webclaw-server/src/routes/crawl.rs +++ b/crates/webclaw-server/src/routes/crawl.rs @@ -36,6 +36,7 @@ pub async fn crawl( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; let max_pages = req.max_pages.unwrap_or(50).min(HARD_MAX_PAGES); let max_depth = req.max_depth.unwrap_or(3); let concurrency = req.concurrency.unwrap_or(5).min(20); @@ -56,8 +57,8 @@ pub async fn crawl( cancel_flag: None, }; - let crawler = Crawler::new(&req.url, config).map_err(ApiError::from)?; - let result = crawler.crawl(&req.url, None).await; + let crawler = Crawler::new(url.as_str(), config).map_err(ApiError::from)?; + let result = crawler.crawl(url.as_str(), None).await; let pages: Vec = result .pages diff --git a/crates/webclaw-server/src/routes/diff.rs b/crates/webclaw-server/src/routes/diff.rs index e4e038d..b0706fb 100644 --- a/crates/webclaw-server/src/routes/diff.rs +++ b/crates/webclaw-server/src/routes/diff.rs @@ -75,8 +75,9 @@ pub async fn diff_route( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; - let current = state.fetch().fetch_and_extract(&req.url).await?; + let current = state.fetch().fetch_and_extract(url.as_str()).await?; let previous = req.previous.into_extraction(); let result = diff(&previous, ¤t); diff --git a/crates/webclaw-server/src/routes/extract.rs b/crates/webclaw-server/src/routes/extract.rs index 05b8909..55b34a0 100644 --- a/crates/webclaw-server/src/routes/extract.rs +++ b/crates/webclaw-server/src/routes/extract.rs @@ -43,10 +43,11 @@ pub async fn extract( "either `schema` or `prompt` is required", )); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; // Fetch + extract first so we feed the LLM clean markdown instead of // raw HTML. Cheaper tokens, better signal. - let extraction = state.fetch().fetch_and_extract(&req.url).await?; + let extraction = state.fetch().fetch_and_extract(url.as_str()).await?; let content = if extraction.content.markdown.trim().is_empty() { extraction.content.plain_text.clone() } else { diff --git a/crates/webclaw-server/src/routes/map.rs b/crates/webclaw-server/src/routes/map.rs index 846183a..6daec69 100644 --- a/crates/webclaw-server/src/routes/map.rs +++ b/crates/webclaw-server/src/routes/map.rs @@ -27,8 +27,9 @@ pub async fn map( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; - let entries = sitemap::discover(state.fetch(), &req.url).await?; + let entries = sitemap::discover(state.fetch(), url.as_str()).await?; let body = if req.include_metadata { json!({ diff --git a/crates/webclaw-server/src/routes/structured.rs b/crates/webclaw-server/src/routes/structured.rs index c9cdc1a..9c10b67 100644 --- a/crates/webclaw-server/src/routes/structured.rs +++ b/crates/webclaw-server/src/routes/structured.rs @@ -25,7 +25,7 @@ impl From for ApiError { match e { ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound, ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()), - ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()), + ExtractorDispatchError::Fetch(f) => ApiError::from(f), } } } @@ -46,7 +46,8 @@ pub async fn scrape_vertical( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } - let data = extractors::dispatch_by_name(state.fetch(), &vertical, &req.url).await?; + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; + let data = extractors::dispatch_by_name(state.fetch(), &vertical, url.as_str()).await?; Ok(Json(json!({ "vertical": vertical, "url": req.url, diff --git a/crates/webclaw-server/src/routes/summarize.rs b/crates/webclaw-server/src/routes/summarize.rs index b967f1f..6b645ab 100644 --- a/crates/webclaw-server/src/routes/summarize.rs +++ b/crates/webclaw-server/src/routes/summarize.rs @@ -22,8 +22,9 @@ pub async fn summarize_route( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; - let extraction = state.fetch().fetch_and_extract(&req.url).await?; + let extraction = state.fetch().fetch_and_extract(url.as_str()).await?; let content = if extraction.content.markdown.trim().is_empty() { extraction.content.plain_text.clone() } else { From 72b8dbc2852e0dbff0b961cbc9519877a7f364b4 Mon Sep 17 00:00:00 2001 From: Valerio Date: Mon, 4 May 2026 21:25:07 +0200 Subject: [PATCH 03/49] fix: improve brand extraction signals --- Cargo.lock | 14 +- Cargo.toml | 3 +- crates/webclaw-core/src/brand.rs | 264 ++++++++++++++++++++++++++----- 3 files changed, 234 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b382000..4a6b90e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3219,7 +3219,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.5.6" +version = "0.5.8" dependencies = [ "clap", "dotenvy", @@ -3240,7 +3240,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.5.6" +version = "0.5.8" dependencies = [ "ego-tree", "once_cell", @@ -3258,7 +3258,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.5.6" +version = "0.5.8" dependencies = [ "async-trait", "bytes", @@ -3284,7 +3284,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.5.6" +version = "0.5.8" dependencies = [ "async-trait", "reqwest", @@ -3297,7 +3297,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.5.6" +version = "0.5.8" dependencies = [ "dirs", "dotenvy", @@ -3317,7 +3317,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.5.6" +version = "0.5.8" dependencies = [ "pdf-extract", "thiserror", @@ -3326,7 +3326,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.5.6" +version = "0.5.8" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index 9b55475..f77595d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.5.7" +version = "0.5.8" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" @@ -21,4 +21,3 @@ tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } clap = { version = "4", features = ["derive", "env"] } dotenvy = "0.15" - diff --git a/crates/webclaw-core/src/brand.rs b/crates/webclaw-core/src/brand.rs index 52eb1b7..8f6de53 100644 --- a/crates/webclaw-core/src/brand.rs +++ b/crates/webclaw-core/src/brand.rs @@ -79,9 +79,19 @@ static HSL_COLOR: Lazy = Lazy::new(|| { .unwrap() }); -/// Matches font-family values -static FONT_FAMILY: Lazy = - Lazy::new(|| Regex::new(r"(?i)font-family\s*:\s*([^;}{]+)").unwrap()); +/// Matches the family tail of CSS `font:` shorthand after size/line-height. +static FONT_SHORTHAND_FAMILY: Lazy = Lazy::new(|| { + Regex::new( + r#"(?ix) + (?:^|\s) + (?:xx-small|x-small|small|medium|large|x-large|xx-large|larger|smaller|\d*\.?\d+(?:px|rem|em|pt|pc|in|cm|mm|%|vw|vh|vmin|vmax)) + (?:\s*/\s*[^\s,]+)? + \s+ + (.+)$ + "#, + ) + .unwrap() +}); macro_rules! selector { ($s:expr) => {{ @@ -102,12 +112,12 @@ pub fn extract_brand(html: &str, url: Option<&str>) -> BrandIdentity { let doc = Html::parse_document(html); let base_url = url.and_then(|u| Url::parse(u).ok()); + let name = extract_brand_name(&doc); let css_sources = collect_css(&doc); - let colors = extract_colors(&css_sources); - let fonts = extract_fonts(&css_sources); + let colors = extract_colors(&css_sources, name.as_deref()); + let fonts = extract_fonts(&css_sources, name.as_deref()); let logo_url = find_logo(&doc, base_url.as_ref()); let favicon_url = find_favicon(&doc, base_url.as_ref()); - let name = extract_brand_name(&doc); let logos = find_all_logos(&doc, base_url.as_ref()); let og_image = find_og_image(&doc, base_url.as_ref()); @@ -390,7 +400,7 @@ fn is_boring_color(hex: &str) -> bool { ) } -fn extract_colors(decls: &[CssDecl]) -> Vec { +fn extract_colors(decls: &[CssDecl], brand_name: Option<&str>) -> Vec { // Track (hex, usage) -> count let mut counts: HashMap> = HashMap::new(); @@ -429,6 +439,8 @@ fn extract_colors(decls: &[CssDecl]) -> Vec { // Sort by frequency (descending) colors.sort_by_key(|c| std::cmp::Reverse(c.count)); + demote_or_remove_oauth_palette(&mut colors, brand_name); + // Promote top non-white/black to Primary/Secondary if they're still Unknown let mut assigned_primary = colors.iter().any(|c| c.usage == ColorUsage::Primary); let mut assigned_secondary = colors.iter().any(|c| c.usage == ColorUsage::Secondary); @@ -450,6 +462,28 @@ fn extract_colors(decls: &[CssDecl]) -> Vec { colors } +const GOOGLE_OAUTH_COLORS: &[&str] = &[ + "#1A73E8", "#4285F4", "#34A853", "#FBBC05", "#EA4335", "#5F6368", "#202124", "#E8EAED", + "#F1F3F4", +]; + +fn demote_or_remove_oauth_palette(colors: &mut Vec, brand_name: Option<&str>) { + let brand = brand_name.unwrap_or("").to_ascii_lowercase(); + if brand.contains("google") { + return; + } + + let google_hits = colors + .iter() + .filter(|c| GOOGLE_OAUTH_COLORS.contains(&c.hex.as_str())) + .count(); + if google_hits < 3 { + return; + } + + colors.retain(|c| !GOOGLE_OAUTH_COLORS.contains(&c.hex.as_str())); +} + fn classify_color_property(property: &str) -> ColorUsage { match property { "background-color" | "background" => ColorUsage::Background, @@ -584,31 +618,55 @@ const GENERIC_FONTS: &[&str] = &[ "initial", "unset", "revert", + "arial", + "times", + "times new roman", + "courier new", + "georgia", + "menlo", + "monaco", + "consolas", + "liberation mono", + "sf mono", + "sfmono-regular", + "source code pro", + "apple color emoji", + "segoe ui", + "segoe ui emoji", + "segoe ui symbol", + "noto color emoji", + "blinkmacsystemfont", + "-apple-system", ]; -fn extract_fonts(decls: &[CssDecl]) -> Vec { +fn extract_fonts(decls: &[CssDecl], brand_name: Option<&str>) -> Vec { let mut freq: HashMap = HashMap::new(); + let brand = brand_name.unwrap_or("").to_ascii_lowercase(); for decl in decls { if decl.property != "font-family" && decl.property != "font" { continue; } - // For shorthand `font:`, try to extract font-family portion + // For shorthand `font:`, extract only the family tail after the + // size/line-height token. The previous implementation treated values + // like `500 12px Roboto` as a font family, which polluted `/v1/brand` + // output with CSS declarations instead of usable family names. let family_str = if decl.property == "font" { - // font shorthand: the font-family is the last part after the size. - // Heuristic: take everything after a `/` or after `px`/`em`/`rem`/`%` + space - FONT_FAMILY - .captures(&format!("font-family: {}", &decl.value)) - .map(|c| c[1].to_string()) - .unwrap_or_else(|| decl.value.clone()) + match parse_font_shorthand_family(&decl.value) { + Some(family) => family, + None => continue, + } } else { decl.value.clone() }; for font in split_font_families(&family_str) { let lower = font.to_lowercase(); - if !GENERIC_FONTS.contains(&lower.as_str()) && !is_junk_font_name(&lower) { + if !GENERIC_FONTS.contains(&lower.as_str()) + && !is_junk_font_name(&lower) + && !is_third_party_auth_font(&lower, &brand) + { *freq.entry(font).or_insert(0) += 1; } } @@ -619,6 +677,32 @@ fn extract_fonts(decls: &[CssDecl]) -> Vec { fonts.into_iter().map(|(name, _)| name).collect() } +fn is_third_party_auth_font(name: &str, brand_name: &str) -> bool { + !brand_name.contains("google") && name.contains("google sans") +} + +fn parse_font_shorthand_family(value: &str) -> Option { + let caps = FONT_SHORTHAND_FAMILY.captures(value)?; + let mut family = caps.get(1)?.as_str().trim().to_string(); + + // Drop the optional slash line-height residue if it was not consumed due + // to unusual whitespace, then leave comma-separated family names intact. + if let Some(stripped) = family.strip_prefix('/') { + family = stripped + .split_once(' ') + .map(|(_, rest)| rest) + .unwrap_or("") + .trim() + .to_string(); + } + + if family.is_empty() { + None + } else { + Some(family) + } +} + /// Filter out junk font names: CSS variables, hex hashes (Next.js font optimization), /// single-character names, and other non-human-readable values. fn is_junk_font_name(name: &str) -> bool { @@ -630,10 +714,43 @@ fn is_junk_font_name(name: &str) -> bool { if name.len() >= 8 && name.chars().all(|c| c.is_ascii_hexdigit()) { return true; } + if name + .split_whitespace() + .next() + .is_some_and(|part| part.len() >= 8 && part.chars().all(|c| c.is_ascii_hexdigit())) + { + return true; + } // Too short to be a real font name if name.len() < 3 { return true; } + // Third-party rendering libraries and icon fonts overwhelm app shells + // like claude.com/openai.com but are not product typography. + if name.contains("katex") + || name.contains("open dyslexic") + || name.contains("opendyslexic") + || name.contains("math") + || name.contains("fraktur") + || name.contains("caligraphic") + || name.contains("typewriter") + || name.contains("glyph") + || name.contains("icon") + || name.contains("emoji") + || name.contains("symbol") + { + return true; + } + // Malformed shorthand leftovers and CSS-internal values. + if name.contains(')') + || name.contains('!') + || name.contains('/') + || name.contains("px ") + || name.contains("rem ") + || name.contains("em ") + { + return true; + } // Starts with underscore or double dash (CSS internals) if name.starts_with('_') || name.starts_with("--") { return true; @@ -662,28 +779,11 @@ fn split_font_families(value: &str) -> Vec { // --------------------------------------------------------------------------- fn find_logo(doc: &Html, base_url: Option<&Url>) -> Option { - // Strategy 1: with class/id containing "logo" - for el in doc.select(selector!("img")) { - let class = el.value().attr("class").unwrap_or(""); - let id = el.value().attr("id").unwrap_or(""); - if (contains_ci(class, "logo") || contains_ci(id, "logo")) - && let Some(src) = el.value().attr("src") - { - return Some(resolve_url(src, base_url)); - } + if let Some(url) = find_logo_in_scope(doc, base_url, "header img, nav img") { + return Some(url); } - // Strategy 2: with alt containing "logo" - for el in doc.select(selector!("img")) { - let alt = el.value().attr("alt").unwrap_or(""); - if contains_ci(alt, "logo") - && let Some(src) = el.value().attr("src") - { - return Some(resolve_url(src, base_url)); - } - } - - // Strategy 3: containing an (homepage link with image) + // Strategy 2: containing an (homepage link with image) for el in doc.select(selector!("a[href='/'] img, a[href] img")) { // Check if parent links to homepage if let Some(parent) = el.parent().and_then(|p| p.value().as_element()) { @@ -699,6 +799,20 @@ fn find_logo(doc: &Html, base_url: Option<&Url>) -> Option { None } +fn find_logo_in_scope(doc: &Html, base_url: Option<&Url>, selector_str: &str) -> Option { + let selector = Selector::parse(selector_str).ok()?; + for el in doc.select(&selector) { + let class = el.value().attr("class").unwrap_or(""); + let id = el.value().attr("id").unwrap_or(""); + let alt = el.value().attr("alt").unwrap_or(""); + let src = el.value().attr("src")?; + if contains_ci(class, "logo") || contains_ci(id, "logo") || contains_ci(alt, "logo") { + return Some(resolve_url(src, base_url)); + } + } + None +} + // --------------------------------------------------------------------------- // Favicon detection // --------------------------------------------------------------------------- @@ -829,8 +943,9 @@ fn find_all_logos(doc: &Html, base_url: Option<&Url>) -> Vec { } } - // Logo images (class/id/alt containing "logo") - for el in doc.select(selector!("img")) { + // Logo images in header/nav first. Product/customer logo grids elsewhere + // are common on SaaS sites and should not become the primary brand signal. + for el in doc.select(selector!("header img, nav img")) { let class = el.value().attr("class").unwrap_or(""); let id = el.value().attr("id").unwrap_or(""); let alt = el.value().attr("alt").unwrap_or(""); @@ -997,6 +1112,25 @@ mod tests { assert!(hexes.contains(&"#3498DB"), "brand color should survive"); } + #[test] + fn test_google_oauth_palette_does_not_overwhelm_non_google_brand() { + let html = r#" + + + "#; + + let brand = extract_brand(html, None); + let hexes: Vec<&str> = brand.colors.iter().map(|c| c.hex.as_str()).collect(); + assert!(!hexes.contains(&"#1A73E8")); + assert!(!hexes.contains(&"#4285F4")); + assert!(hexes.contains(&"#D97757")); + assert!(hexes.contains(&"#DC6038")); + } + #[test] fn test_font_extraction() { let html = r#""#; + + let brand = extract_brand(html, None); + assert!(brand.fonts.contains(&"Roboto".to_string())); + assert!(brand.fonts.contains(&"OpenAI Sans".to_string())); + assert!(!brand.fonts.iter().any(|f| f.contains("12px"))); + assert!(!brand.fonts.iter().any(|f| f.contains("KaTeX"))); + assert!(!brand.fonts.iter().any(|f| f.contains("Emoji"))); + assert!(!brand.fonts.iter().any(|f| f.contains("9d9927955a95a20d"))); + } + #[test] fn test_logo_by_class() { let html = r#" @@ -1086,6 +1238,42 @@ mod tests { ); } + #[test] + fn test_body_logo_grid_does_not_become_primary_brand_logo() { + let html = r#" +
+
+ + +
+
+ "#; + + let brand = extract_brand(html, Some("https://example.com")); + assert_eq!(brand.logo_url, None); + assert!(brand.logos.is_empty()); + } + + #[test] + fn test_header_logo_is_still_primary_logo() { + let html = r#" +
+ +
+
+ +
+ "#; + + let brand = extract_brand(html, Some("https://example.com")); + assert_eq!( + brand.logo_url.as_deref(), + Some("https://example.com/logo.svg") + ); + assert_eq!(brand.logos.len(), 1); + assert_eq!(brand.logos[0].url, "https://example.com/logo.svg"); + } + #[test] fn test_favicon() { let html = r#" From 615f3266603a7915b1e898c74ad1eb3a895e6c2f Mon Sep 17 00:00:00 2001 From: Valerio Date: Mon, 4 May 2026 21:52:49 +0200 Subject: [PATCH 04/49] docs: update changelog for brand extraction --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index afec609..01e4612 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.5.8] — 2026-05-04 + +### Fixed +- Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. + +--- + ## [0.5.7] — 2026-04-30 ### Security From a542e45768d54dc7f028485df7d18b6d8954b5e7 Mon Sep 17 00:00:00 2001 From: Justin Levine <20596508+jal-co@users.noreply.github.com> Date: Tue, 5 May 2026 02:17:21 -0700 Subject: [PATCH 05/49] docs: refresh README badges Replace README badges with shieldcn-styled badges. --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 623a4d3..4362d35 100644 --- a/README.md +++ b/README.md @@ -12,16 +12,16 @@

- Stars - Version - License - npm installs + Stars + Version + License + npm installs

- Discord - X / Twitter - Website - Docs + Discord + X / Twitter + Website + Docs

--- From a1242a1c1d116c142c6a98ee18e27f50a90d201d Mon Sep 17 00:00:00 2001 From: Valerio Date: Tue, 5 May 2026 11:18:58 +0200 Subject: [PATCH 06/49] docs: credit README badge refresh --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01e4612..53f636f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). ### Fixed - Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. +### Docs +- Refreshed the README badges with a cleaner shieldcn style. Thanks to Justin Levine (`@jal-co`) for the contribution, and shout-out to his open-source [shieldcn](https://github.com/jal-co/shieldcn) project. + --- ## [0.5.7] — 2026-04-30 From 513b0e493eaa7a7e47f5cb44880bb837be312477 Mon Sep 17 00:00:00 2001 From: SURYANSH MISHRA Date: Tue, 5 May 2026 11:38:30 +0200 Subject: [PATCH 07/49] ci: add Windows release artifacts Closes #34 --- .github/workflows/release.yml | 36 +++++++++++++++++++++++++++-------- CHANGELOG.md | 3 +++ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4c4c241..b2ea54a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -27,6 +27,8 @@ jobs: os: ubuntu-latest - target: aarch64-unknown-linux-gnu os: ubuntu-latest + - target: x86_64-pc-windows-msvc + os: windows-latest steps: - uses: actions/checkout@v4 @@ -57,6 +59,12 @@ jobs: if: matrix.target != 'aarch64-unknown-linux-gnu' && runner.os == 'Linux' run: sudo apt-get update && sudo apt-get install -y cmake + - name: Install NASM (Windows) + if: runner.os == 'Windows' + run: | + choco install nasm -y + echo "C:\Program Files\NASM" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Build run: cargo build --release --target ${{ matrix.target }} @@ -71,12 +79,22 @@ jobs: # don't repeat that mistake. If a future binary gets renamed or # removed, this step should scream, not quietly publish an # incomplete release. - cp target/${{ matrix.target }}/release/webclaw "$staging/" - cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/" - cp target/${{ matrix.target }}/release/webclaw-server "$staging/" - cp README.md LICENSE "$staging/" - tar czf "$staging.tar.gz" "$staging" - echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV + + if [[ "${{ matrix.os }}" == "windows-latest" ]]; then + cp target/${{ matrix.target }}/release/webclaw.exe "$staging/" + cp target/${{ matrix.target }}/release/webclaw-mcp.exe "$staging/" + cp target/${{ matrix.target }}/release/webclaw-server.exe "$staging/" + cp README.md LICENSE "$staging/" + 7z a -tzip "$staging.zip" "$staging" + echo "ASSET=$staging.zip" >> $GITHUB_ENV + else + cp target/${{ matrix.target }}/release/webclaw "$staging/" + cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/" + cp target/${{ matrix.target }}/release/webclaw-server "$staging/" + cp README.md LICENSE "$staging/" + tar czf "$staging.tar.gz" "$staging" + echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV + fi - name: Upload artifact uses: actions/upload-artifact@v4 @@ -99,7 +117,8 @@ jobs: run: | cd artifacts find . -name '*.tar.gz' -exec mv {} . \; - sha256sum *.tar.gz > SHA256SUMS + find . -name '*.zip' -exec mv {} . \; + sha256sum *.tar.gz *.zip > SHA256SUMS 2>/dev/null || sha256sum * > SHA256SUMS cat SHA256SUMS - name: Create GitHub Release @@ -108,6 +127,7 @@ jobs: generate_release_notes: true files: | artifacts/*.tar.gz + artifacts/*.zip artifacts/SHA256SUMS docker: @@ -181,7 +201,7 @@ jobs: tag="${GITHUB_REF#refs/tags/}" base="https://github.com/0xMassi/webclaw/releases/download/${tag}" - # Download all 4 tarballs and compute SHAs + # Download all tarballs (Linux + macOS) and compute SHAs for target in aarch64-apple-darwin x86_64-apple-darwin aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu; do curl -sSL "${base}/webclaw-${tag}-${target}.tar.gz" -o "${target}.tar.gz" done diff --git a/CHANGELOG.md b/CHANGELOG.md index 53f636f..4e2a0ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). ## [0.5.8] — 2026-05-04 +### Added +- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. + ### Fixed - Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. From 86183b11e4e4e8e695836a6b2b042f3df0994985 Mon Sep 17 00:00:00 2001 From: Valerio Date: Tue, 5 May 2026 11:44:07 +0200 Subject: [PATCH 08/49] docs: credit Windows release contribution --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e2a0ee..63d163f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). ## [0.5.8] — 2026-05-04 ### Added -- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. +- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. Thanks to Suryansh Mishra (`@notrealsuryansh`) for the contribution. ### Fixed - Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. From a3aa4bce6f7a9a4d1b4d3e8bdb78edea75042a73 Mon Sep 17 00:00:00 2001 From: Valerio Date: Wed, 6 May 2026 11:36:53 +0200 Subject: [PATCH 09/49] fix: support LLM provider compatibility options Closes #36 --- CHANGELOG.md | 1 + README.md | 3 + crates/webclaw-cli/src/main.rs | 5 +- crates/webclaw-llm/src/chain.rs | 2 +- crates/webclaw-llm/src/providers/anthropic.rs | 61 +++++++- crates/webclaw-llm/src/providers/openai.rs | 137 ++++++++++++++++-- 6 files changed, 193 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 63d163f..8e30acd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). ### Added - GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. Thanks to Suryansh Mishra (`@notrealsuryansh`) for the contribution. +- LLM providers now support `ANTHROPIC_BASE_URL` for Anthropic-compatible proxies, plus an `OPENAI_RESPONSE_FORMAT_TYPE` override for OpenAI-compatible backends such as LM Studio. Thanks to Toti (`@Toti330`) for the report. ### Fixed - Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. diff --git a/README.md b/README.md index 4362d35..79758f0 100644 --- a/README.md +++ b/README.md @@ -358,7 +358,10 @@ webclaw/ | `WEBCLAW_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) | | `OLLAMA_HOST` | Ollama URL for local LLM features (default: `http://localhost:11434`) | | `OPENAI_API_KEY` | OpenAI API key for LLM features | +| `OPENAI_BASE_URL` | OpenAI-compatible base URL (default: `https://api.openai.com/v1`) | +| `OPENAI_RESPONSE_FORMAT_TYPE` | JSON-mode response format for OpenAI-compatible backends: `json_object` (default), `json_schema`, or `text`. Use `text` or `json_schema` for LM Studio. | | `ANTHROPIC_API_KEY` | Anthropic API key for LLM features | +| `ANTHROPIC_BASE_URL` | Anthropic-compatible base URL (default: `https://api.anthropic.com/v1`) | | `WEBCLAW_PROXY` | Single proxy URL | | `WEBCLAW_PROXY_FILE` | Path to proxy pool file | diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index e97f15d..a45bce8 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -260,7 +260,7 @@ struct Cli { #[arg(long, env = "WEBCLAW_LLM_MODEL")] llm_model: Option, - /// Override the LLM base URL (Ollama or OpenAI-compatible) + /// Override the LLM base URL (Ollama, OpenAI-compatible, or Anthropic-compatible) #[arg(long, env = "WEBCLAW_LLM_BASE_URL")] llm_base_url: Option, @@ -1919,8 +1919,9 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> { Ok(Box::new(provider)) } "anthropic" => { - let provider = webclaw_llm::providers::anthropic::AnthropicProvider::new( + let provider = webclaw_llm::providers::anthropic::AnthropicProvider::with_base_url( None, + cli.llm_base_url.clone(), cli.llm_model.clone(), ) .ok_or("ANTHROPIC_API_KEY not set")?; diff --git a/crates/webclaw-llm/src/chain.rs b/crates/webclaw-llm/src/chain.rs index 314bf2a..86b0101 100644 --- a/crates/webclaw-llm/src/chain.rs +++ b/crates/webclaw-llm/src/chain.rs @@ -34,7 +34,7 @@ impl ProviderChain { providers.push(Box::new(openai)); } - if let Some(anthropic) = AnthropicProvider::new(None, None) { + if let Some(anthropic) = AnthropicProvider::with_base_url(None, None, None) { debug!("anthropic configured, adding to chain"); providers.push(Box::new(anthropic)); } diff --git a/crates/webclaw-llm/src/providers/anthropic.rs b/crates/webclaw-llm/src/providers/anthropic.rs index 71ca1f9..e6e43c8 100644 --- a/crates/webclaw-llm/src/providers/anthropic.rs +++ b/crates/webclaw-llm/src/providers/anthropic.rs @@ -10,23 +10,38 @@ use crate::provider::{CompletionRequest, LlmProvider}; use super::load_api_key; -const ANTHROPIC_API_URL: &str = "https://api.anthropic.com/v1/messages"; +const DEFAULT_ANTHROPIC_BASE_URL: &str = "https://api.anthropic.com/v1"; const ANTHROPIC_VERSION: &str = "2023-06-01"; pub struct AnthropicProvider { client: reqwest::Client, key: String, + base_url: String, default_model: String, } impl AnthropicProvider { /// Returns `None` if no API key is available (param or env). pub fn new(key_override: Option, model: Option) -> Option { + Self::with_base_url(key_override, None, model) + } + + /// Returns `None` if no API key is available (param or env). + pub fn with_base_url( + key_override: Option, + base_url: Option, + model: Option, + ) -> Option { let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?; Some(Self { client: reqwest::Client::new(), key, + base_url: base_url + .or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok()) + .unwrap_or_else(|| DEFAULT_ANTHROPIC_BASE_URL.into()) + .trim_end_matches('/') + .to_string(), default_model: model.unwrap_or_else(|| "claude-sonnet-4-20250514".into()), }) } @@ -34,6 +49,14 @@ impl AnthropicProvider { pub fn default_model(&self) -> &str { &self.default_model } + + fn messages_url(&self) -> String { + if self.base_url.ends_with("/messages") { + self.base_url.clone() + } else { + format!("{}/messages", self.base_url) + } + } } #[async_trait] @@ -74,7 +97,7 @@ impl LlmProvider for AnthropicProvider { let resp = self .client - .post(ANTHROPIC_API_URL) + .post(self.messages_url()) .header("x-api-key", &self.key) .header("anthropic-version", ANTHROPIC_VERSION) .header("content-type", "application/json") @@ -135,6 +158,11 @@ mod tests { assert_eq!(provider.name(), "anthropic"); assert_eq!(provider.default_model, "claude-sonnet-4-20250514"); assert_eq!(provider.key, "sk-ant-test"); + assert_eq!(provider.base_url, "https://api.anthropic.com/v1"); + assert_eq!( + provider.messages_url(), + "https://api.anthropic.com/v1/messages" + ); } #[test] @@ -151,6 +179,35 @@ mod tests { assert_eq!(provider.default_model(), "claude-sonnet-4-20250514"); } + #[test] + fn custom_base_url_appends_messages_path() { + let provider = AnthropicProvider::with_base_url( + Some("sk-ant-test".into()), + Some("https://proxy.example.test/anthropic/v1/".into()), + None, + ) + .unwrap(); + assert_eq!(provider.base_url, "https://proxy.example.test/anthropic/v1"); + assert_eq!( + provider.messages_url(), + "https://proxy.example.test/anthropic/v1/messages" + ); + } + + #[test] + fn custom_full_messages_url_is_not_doubled() { + let provider = AnthropicProvider::with_base_url( + Some("sk-ant-test".into()), + Some("https://proxy.example.test/v1/messages".into()), + None, + ) + .unwrap(); + assert_eq!( + provider.messages_url(), + "https://proxy.example.test/v1/messages" + ); + } + // Env var fallback tests mutate process-global state and race with parallel tests. // The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed: // cargo test -p webclaw-llm env_var -- --ignored --test-threads=1 diff --git a/crates/webclaw-llm/src/providers/openai.rs b/crates/webclaw-llm/src/providers/openai.rs index 6422cc4..3780d8f 100644 --- a/crates/webclaw-llm/src/providers/openai.rs +++ b/crates/webclaw-llm/src/providers/openai.rs @@ -13,6 +13,50 @@ pub struct OpenAiProvider { key: String, base_url: String, default_model: String, + response_format: OpenAiResponseFormat, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum OpenAiResponseFormat { + JsonObject, + JsonSchema, + Text, +} + +impl OpenAiResponseFormat { + fn from_env() -> Self { + std::env::var("OPENAI_RESPONSE_FORMAT_TYPE") + .ok() + .and_then(|value| Self::parse(&value)) + .unwrap_or(Self::JsonObject) + } + + fn parse(value: &str) -> Option { + match value.trim().to_ascii_lowercase().as_str() { + "" | "json_object" => Some(Self::JsonObject), + "json_schema" => Some(Self::JsonSchema), + "text" => Some(Self::Text), + _ => None, + } + } + + fn as_response_format(self) -> serde_json::Value { + match self { + Self::JsonObject => json!({ "type": "json_object" }), + Self::JsonSchema => json!({ + "type": "json_schema", + "json_schema": { + "name": "webclaw_response", + "schema": { + "type": "object", + "additionalProperties": true + }, + "strict": false + } + }), + Self::Text => json!({ "type": "text" }), + } + } } impl OpenAiProvider { @@ -31,23 +75,15 @@ impl OpenAiProvider { .or_else(|| std::env::var("OPENAI_BASE_URL").ok()) .unwrap_or_else(|| "https://api.openai.com/v1".into()), default_model: model.unwrap_or_else(|| "gpt-4o-mini".into()), + response_format: OpenAiResponseFormat::from_env(), }) } pub fn default_model(&self) -> &str { &self.default_model } -} - -#[async_trait] -impl LlmProvider for OpenAiProvider { - async fn complete(&self, request: &CompletionRequest) -> Result { - let model = if request.model.is_empty() { - &self.default_model - } else { - &request.model - }; + fn request_body(&self, request: &CompletionRequest, model: &str) -> serde_json::Value { let messages: Vec = request .messages .iter() @@ -60,7 +96,7 @@ impl LlmProvider for OpenAiProvider { }); if request.json_mode { - body["response_format"] = json!({ "type": "json_object" }); + body["response_format"] = self.response_format.as_response_format(); } if let Some(temp) = request.temperature { body["temperature"] = json!(temp); @@ -69,6 +105,21 @@ impl LlmProvider for OpenAiProvider { body["max_tokens"] = json!(max); } + body + } +} + +#[async_trait] +impl LlmProvider for OpenAiProvider { + async fn complete(&self, request: &CompletionRequest) -> Result { + let model = if request.model.is_empty() { + &self.default_model + } else { + &request.model + }; + + let body = self.request_body(request, model); + let url = format!("{}/chat/completions", self.base_url); let resp = self .client @@ -136,6 +187,7 @@ mod tests { assert_eq!(provider.default_model, "gpt-4o-mini"); assert_eq!(provider.base_url, "https://api.openai.com/v1"); assert_eq!(provider.key, "test-key-123"); + assert_eq!(provider.response_format, OpenAiResponseFormat::JsonObject); } #[test] @@ -161,6 +213,69 @@ mod tests { assert_eq!(provider.default_model(), "gpt-4o-mini"); } + #[test] + fn json_mode_defaults_to_openai_json_object() { + let provider = OpenAiProvider::new( + Some("test-key".into()), + Some("https://api.openai.com/v1".into()), + None, + ) + .unwrap(); + let req = CompletionRequest { + model: String::new(), + messages: vec![], + temperature: None, + max_tokens: None, + json_mode: true, + }; + let body = provider.request_body(&req, provider.default_model()); + assert_eq!(body["response_format"], json!({ "type": "json_object" })); + } + + #[test] + fn json_schema_response_format_for_compatible_backends() { + let mut provider = OpenAiProvider::new( + Some("test-key".into()), + Some("http://localhost:1234/v1".into()), + Some("local-model".into()), + ) + .unwrap(); + provider.response_format = OpenAiResponseFormat::JsonSchema; + let req = CompletionRequest { + model: String::new(), + messages: vec![], + temperature: None, + max_tokens: None, + json_mode: true, + }; + let body = provider.request_body(&req, provider.default_model()); + assert_eq!(body["response_format"]["type"], "json_schema"); + assert_eq!( + body["response_format"]["json_schema"]["schema"]["type"], + "object" + ); + } + + #[test] + fn text_response_format_for_lm_studio() { + let mut provider = OpenAiProvider::new( + Some("test-key".into()), + Some("http://localhost:1234/v1".into()), + Some("local-model".into()), + ) + .unwrap(); + provider.response_format = OpenAiResponseFormat::Text; + let req = CompletionRequest { + model: String::new(), + messages: vec![], + temperature: None, + max_tokens: None, + json_mode: true, + }; + let body = provider.request_body(&req, provider.default_model()); + assert_eq!(body["response_format"], json!({ "type": "text" })); + } + // Env var fallback tests mutate process-global state and race with parallel tests. // The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed: // cargo test -p webclaw-llm env_var -- --ignored --test-threads=1 From e6a95f783dd9eea4fe0b34bfc0e8f70bf3ff74f5 Mon Sep 17 00:00:00 2001 From: Valerio Date: Wed, 6 May 2026 11:42:09 +0200 Subject: [PATCH 10/49] chore: bump version to 0.5.9 --- CHANGELOG.md | 8 +++++++- Cargo.lock | 14 +++++++------- Cargo.toml | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e30acd..7858ae4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,17 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.5.9] — 2026-05-06 + +### Fixed +- LLM providers now support `ANTHROPIC_BASE_URL` for Anthropic-compatible proxies, plus an `OPENAI_RESPONSE_FORMAT_TYPE` override for OpenAI-compatible backends such as LM Studio. Thanks to Toti (`@Toti330`) for the report. + +--- + ## [0.5.8] — 2026-05-04 ### Added - GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. Thanks to Suryansh Mishra (`@notrealsuryansh`) for the contribution. -- LLM providers now support `ANTHROPIC_BASE_URL` for Anthropic-compatible proxies, plus an `OPENAI_RESPONSE_FORMAT_TYPE` override for OpenAI-compatible backends such as LM Studio. Thanks to Toti (`@Toti330`) for the report. ### Fixed - Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. diff --git a/Cargo.lock b/Cargo.lock index 4a6b90e..e49ccc3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3219,7 +3219,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.5.8" +version = "0.5.9" dependencies = [ "clap", "dotenvy", @@ -3240,7 +3240,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.5.8" +version = "0.5.9" dependencies = [ "ego-tree", "once_cell", @@ -3258,7 +3258,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.5.8" +version = "0.5.9" dependencies = [ "async-trait", "bytes", @@ -3284,7 +3284,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.5.8" +version = "0.5.9" dependencies = [ "async-trait", "reqwest", @@ -3297,7 +3297,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.5.8" +version = "0.5.9" dependencies = [ "dirs", "dotenvy", @@ -3317,7 +3317,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.5.8" +version = "0.5.9" dependencies = [ "pdf-extract", "thiserror", @@ -3326,7 +3326,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.5.8" +version = "0.5.9" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index f77595d..12a4b73 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.5.8" +version = "0.5.9" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" From 7f7514395415484e0e9da3ad5178e0578917e09d Mon Sep 17 00:00:00 2001 From: Valerio Date: Wed, 6 May 2026 17:16:35 +0200 Subject: [PATCH 11/49] docs: update hosted api trial copy --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 79758f0..7d936c6 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ It extracts clean, structured content from any URL using Chrome-level TLS finger **Self-host.** Free, AGPL-3.0, runs locally. Get the CLI, MCP server, or REST API in one command. Ships with the 8 core extraction tools: scrape, crawl, map, batch, extract, summarize, diff, brand. -**Hosted API** at **[webclaw.io](https://webclaw.io)**. 500 pages/month free, no card. Adds what self-hosting can't do alone: antibot bypass (Cloudflare, DataDome, WAF), JS rendering, async crawl/research jobs, web search, watches. For when you want it to *just work*. +**Hosted API** at **[webclaw.io](https://webclaw.io)**. Start with a 7-day Starter trial, card required. Adds what self-hosting can't do alone: antibot bypass (Cloudflare, DataDome, WAF), JS rendering, async crawl/research jobs, web search, watches. For when you want it to *just work*. --- From e8ca1417d699d977fd4d08af435758be127e7226 Mon Sep 17 00:00:00 2001 From: devnen Date: Sun, 10 May 2026 15:11:12 +0200 Subject: [PATCH 12/49] Improve --format llm output quality (#37) Improve LLM-format output for modern news and documentation pages. - Filter noisy hydration and low-value page chrome structured data while preserving content-bearing Schema.org records - Fix element/text spacing without detaching punctuation on docs, forums, and reference pages - Remove common accessibility link chrome from LLM text and link labels - Bump workspace version to 0.6.0 and update the changelog Thanks to Nenad Oric (@devnen) for the original PR and contribution. --- CHANGELOG.md | 9 ++ Cargo.lock | 14 +-- Cargo.toml | 2 +- crates/webclaw-core/src/llm/body.rs | 3 + crates/webclaw-core/src/llm/cleanup.rs | 83 ++++++++++++++ crates/webclaw-core/src/llm/links.rs | 25 +++++ crates/webclaw-core/src/llm/mod.rs | 148 ++++++++++++++++++++++++- crates/webclaw-core/src/markdown.rs | 103 ++++++++++++++++- 8 files changed, 371 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7858ae4..025b1db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,15 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.6.0] — 2026-05-10 + +### Fixed +- Improved `--format llm` output quality on modern news and documentation pages. Framework hydration blobs and low-value page chrome structured-data records are now filtered out before they can flood the LLM context, while content-bearing Schema.org records are preserved. Thanks and congrats to Nenad Oric (`@devnen`) for the contribution in PR #37. +- Fixed element-to-text spacing so adjacent inline nodes no longer smash words together, while punctuation stays attached on real pages such as docs, forums, and reference sites. +- Removed common screen-reader-only link chrome such as "opens new tab" from LLM body text and link labels without stripping ordinary prose that happens to mention external links. + +--- + ## [0.5.9] — 2026-05-06 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index e49ccc3..ab23a3f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3219,7 +3219,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.5.9" +version = "0.6.0" dependencies = [ "clap", "dotenvy", @@ -3240,7 +3240,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.5.9" +version = "0.6.0" dependencies = [ "ego-tree", "once_cell", @@ -3258,7 +3258,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.5.9" +version = "0.6.0" dependencies = [ "async-trait", "bytes", @@ -3284,7 +3284,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.5.9" +version = "0.6.0" dependencies = [ "async-trait", "reqwest", @@ -3297,7 +3297,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.5.9" +version = "0.6.0" dependencies = [ "dirs", "dotenvy", @@ -3317,7 +3317,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.5.9" +version = "0.6.0" dependencies = [ "pdf-extract", "thiserror", @@ -3326,7 +3326,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.5.9" +version = "0.6.0" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index 12a4b73..6e87225 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.5.9" +version = "0.6.0" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-core/src/llm/body.rs b/crates/webclaw-core/src/llm/body.rs index 5311121..db2a011 100644 --- a/crates/webclaw-core/src/llm/body.rs +++ b/crates/webclaw-core/src/llm/body.rs @@ -29,6 +29,9 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody { // 0c. Strip leaked JavaScript (framework hydration, self.__wrap_n, etc.) let text = cleanup::strip_leaked_js(&text); + // 0c2. Strip a11y link chrome ("opens new tab", external link hints) + let text = cleanup::strip_a11y_link_chrome(&text); + // 0d. Collapse spaced-out text (CSS animation artifacts like "S t a r t") // Must run before any dedup -- spaced text confuses word-based dedup. let text = cleanup::collapse_spaced_text(&text); diff --git a/crates/webclaw-core/src/llm/cleanup.rs b/crates/webclaw-core/src/llm/cleanup.rs index c8e14ed..dc447a5 100644 --- a/crates/webclaw-core/src/llm/cleanup.rs +++ b/crates/webclaw-core/src/llm/cleanup.rs @@ -146,6 +146,45 @@ pub(crate) fn strip_leaked_js(input: &str) -> String { out } +// --------------------------------------------------------------------------- +// Accessibility link chrome ("opens new tab", "external link") +// --------------------------------------------------------------------------- + +/// Strip screen-reader-only link chrome that bleeds into rendered text. +/// +/// Sites like Reuters wrap external/new-window links with hidden spans +/// like `, opens new tab`. The noise +/// filter can't reliably catch these (no consistent class hook across +/// sites), so they end up duplicated all over the body text. This is a +/// targeted text-level scrub of the most common phrasings. +pub(crate) fn strip_a11y_link_chrome(input: &str) -> String { + static A11Y_PATTERN: Lazy = Lazy::new(|| { + Regex::new( + r"(?i)(?:\s*,\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?|\s+\((?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\)\.?|\s+external link\b\.?$)", + ) + .unwrap() + }); + + let mut out = String::with_capacity(input.len()); + let mut in_code_fence = false; + for (i, line) in input.lines().enumerate() { + if i > 0 { + out.push('\n'); + } + if line.trim().starts_with("```") { + in_code_fence = !in_code_fence; + out.push_str(line); + continue; + } + if in_code_fence { + out.push_str(line); + continue; + } + out.push_str(&A11Y_PATTERN.replace_all(line, "")); + } + out +} + // --------------------------------------------------------------------------- // Spaced-out text collapsing (CSS animation artifacts) // --------------------------------------------------------------------------- @@ -1356,4 +1395,48 @@ mod tests { let input = "```\nImage of something in code\n```"; assert_eq!(strip_alt_text_noise(input), input); } + + #[test] + fn a11y_strips_opens_new_tab() { + let input = "Download the App, opens new tab and Subscribe, opens new tab."; + let out = strip_a11y_link_chrome(input); + assert!(!out.to_lowercase().contains("opens new tab"), "leak: {out}"); + assert!(out.contains("Download the App")); + assert!(out.contains("Subscribe")); + } + + #[test] + fn a11y_strips_external_link_variants() { + let cases = [ + ("Visit our docs, opens external link", "Visit our docs"), + ("Click here, opens in a new window.", "Click here"), + ("More info external link", "More info"), + ]; + for (input, expected_prefix) in cases { + let out = strip_a11y_link_chrome(input); + assert!( + out.starts_with(expected_prefix), + "input={input:?} got={out:?}" + ); + assert!(!out.to_lowercase().contains("opens"), "leak: {out}"); + } + } + + #[test] + fn a11y_preserves_code_blocks() { + let input = "```\nopens new tab is a function\n```\nDownload, opens new tab"; + let out = strip_a11y_link_chrome(input); + assert!( + out.contains("opens new tab is a function"), + "code stripped: {out}" + ); + // Outside the fence, the chrome is removed. + assert!(!out.to_lowercase().contains("download, opens new tab")); + } + + #[test] + fn a11y_preserves_external_link_prose() { + let input = "Researchers found an external link between the two incidents."; + assert_eq!(strip_a11y_link_chrome(input), input); + } } diff --git a/crates/webclaw-core/src/llm/links.rs b/crates/webclaw-core/src/llm/links.rs index 0656aac..3d25179 100644 --- a/crates/webclaw-core/src/llm/links.rs +++ b/crates/webclaw-core/src/llm/links.rs @@ -88,10 +88,19 @@ fn is_noise_link(text: &str, href: &str) -> bool { static MD_MARKERS_RE: Lazy = Lazy::new(|| Regex::new(r"#{1,6}\s+|\*{1,2}|_{1,2}|`").unwrap()); +static A11Y_LABEL_RE: Lazy = Lazy::new(|| { + Regex::new( + r"(?i)(?:\s*,?\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website))\b\.?|\s*,\s*external link\b\.?|\s+external link\b\.?$)", + ) + .unwrap() +}); + /// Clean a link label: strip markdown, dedup repeated phrases, truncate. pub(crate) fn clean_link_label(raw: &str) -> String { // Strip markdown markers let label = MD_MARKERS_RE.replace_all(raw, "").to_string(); + // Strip a11y link chrome ("opens new tab", etc.) + let label = A11Y_LABEL_RE.replace_all(&label, "").to_string(); let label = label.split_whitespace().collect::>().join(" "); // Dedup repeated phrases in label @@ -181,4 +190,20 @@ mod tests { assert!(is_noise_link("user", "https://hn.com/user?id=foo")); assert!(!is_noise_link("Rust docs", "https://rust-lang.org")); } + + #[test] + fn link_label_preserves_external_link_prose() { + assert_eq!( + clean_link_label("Research found an external link between incidents"), + "Research found an external link between incidents" + ); + } + + #[test] + fn link_label_strips_terminal_external_link_chrome() { + assert_eq!( + clean_link_label("Reuters story external link"), + "Reuters story" + ); + } } diff --git a/crates/webclaw-core/src/llm/mod.rs b/crates/webclaw-core/src/llm/mod.rs index 126558f..bc65be6 100644 --- a/crates/webclaw-core/src/llm/mod.rs +++ b/crates/webclaw-core/src/llm/mod.rs @@ -46,15 +46,73 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String { } // -- 4. Structured data (NEXT_DATA, SvelteKit, JSON-LD) -- - if !result.structured_data.is_empty() { - out.push_str("\n\n## Structured Data\n\n```json\n"); - out.push_str(&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default()); - out.push_str("\n```"); + // Only emit useful items: Schema.org records with a meaningful @type, + // and only if the total serialized size stays under a budget. Framework + // hydration blobs (Next.js pageProps full of ad-targeting flags, build + // IDs, schedule paths) explode to hundreds of KB and drown the LLM in + // noise — drop them rather than ship them. + let useful: Vec<_> = result + .structured_data + .iter() + .filter(|v| is_useful_structured_data(v)) + .cloned() + .collect(); + if !useful.is_empty() { + let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default(); + const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024; + if serialized.len() <= STRUCTURED_DATA_MAX_BYTES { + out.push_str("\n\n## Structured Data\n\n```json\n"); + out.push_str(&serialized); + out.push_str("\n```"); + } } out.trim().to_string() } +/// Decide whether a structured-data value carries content worth emitting. +/// +/// Schema.org records with a recognizable content `@type` (Article, NewsArticle, +/// Product, Recipe, FAQPage, HowTo, Event, Person, Organization, BreadcrumbList, +/// VideoObject, JobPosting, etc.) are kept. Generic `WebSite` / `WebPage` / +/// `ItemList` records and Next.js `pageProps`-style blobs without a useful +/// `@type` are dropped — they're almost always navigation chrome or framework +/// hydration state. +fn is_useful_structured_data(v: &serde_json::Value) -> bool { + let Some(obj) = v.as_object() else { + // SvelteKit can emit compact arrays of page data. Keep those if they + // are small enough to be useful, while still dropping giant hydration + // arrays under the same budget as untyped objects. + if v.is_array() { + let serialized = serde_json::to_string(v).unwrap_or_default(); + return serialized.len() <= 4 * 1024; + } + return false; + }; + // JSON-LD: @type drives the decision. + if let Some(t) = obj.get("@type") { + let types: Vec = match t { + serde_json::Value::String(s) => vec![s.to_ascii_lowercase()], + serde_json::Value::Array(a) => a + .iter() + .filter_map(|x| x.as_str()) + .map(str::to_ascii_lowercase) + .collect(), + _ => Vec::new(), + }; + if types.is_empty() { + return false; + } + // Drop low-info chrome types. + const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"]; + return types.iter().any(|t| !DROP_TYPES.iter().any(|d| t == d)); + } + // Next.js pageProps / SvelteKit data without @type: keep only if compact. + // Anything over ~4KB is almost certainly hydration state, not content. + let serialized = serde_json::to_string(v).unwrap_or_default(); + serialized.len() <= 4 * 1024 +} + // --------------------------------------------------------------------------- // Integration tests that exercise the full pipeline through to_llm_text // --------------------------------------------------------------------------- @@ -700,4 +758,86 @@ mod tests { assert!(out.contains("Some content"), "Content before lost: {out}"); assert!(out.contains("More content"), "Content after lost: {out}"); } + + // -- Structured-data gating tests -- + + fn make_result_with_structured(values: Vec) -> ExtractionResult { + let mut r = make_result("# Body"); + r.structured_data = values; + r + } + + #[test] + fn structured_data_drops_chrome_types() { + // WebSite/WebPage records are framework chrome — should be dropped. + let r = make_result_with_structured(vec![serde_json::json!({ + "@type": "WebSite", + "name": "Example", + "url": "https://example.com" + })]); + let out = to_llm_text(&r, None); + assert!( + !out.contains("## Structured Data"), + "WebSite chrome leaked into output: {out}" + ); + } + + #[test] + fn structured_data_keeps_article_types() { + let r = make_result_with_structured(vec![serde_json::json!({ + "@type": "NewsArticle", + "headline": "Big news", + "datePublished": "2026-05-10" + })]); + let out = to_llm_text(&r, None); + assert!( + out.contains("## Structured Data"), + "NewsArticle dropped: {out}" + ); + assert!(out.contains("Big news")); + } + + #[test] + fn structured_data_drops_oversized_blob() { + // 32KB pageProps-style blob with no @type — should be dropped. + let big = "x".repeat(32 * 1024); + let r = make_result_with_structured(vec![serde_json::json!({ + "buildId": "abc", + "isFallback": false, + "noise": big + })]); + let out = to_llm_text(&r, None); + assert!( + !out.contains("## Structured Data"), + "Oversized untyped blob leaked: len={}", + out.len() + ); + } + + #[test] + fn structured_data_keeps_compact_untyped() { + // Small untyped record (e.g. a parsed pageProps with real content) — keep. + let r = make_result_with_structured(vec![serde_json::json!({ + "title": "Hi", + "body": "small enough to keep" + })]); + let out = to_llm_text(&r, None); + assert!( + out.contains("## Structured Data"), + "Compact untyped dropped: {out}" + ); + } + + #[test] + fn structured_data_keeps_compact_untyped_array() { + // SvelteKit can emit compact arrays rather than objects. + let r = make_result_with_structured(vec![serde_json::json!([ + { "title": "Hi", "body": "small array item" } + ])]); + let out = to_llm_text(&r, None); + assert!( + out.contains("small array item"), + "Compact untyped array dropped: {out}" + ); + } } diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs index d0a2c23..2699166 100644 --- a/crates/webclaw-core/src/markdown.rs +++ b/crates/webclaw-core/src/markdown.rs @@ -320,6 +320,9 @@ fn children_to_md( } } Node::Text(text) => { + if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) { + out.push(' '); + } out.push_str(text); } _ => {} @@ -350,6 +353,9 @@ fn inline_text( } } Node::Text(text) => { + if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) { + out.push(' '); + } out.push_str(text); } _ => {} @@ -361,11 +367,65 @@ fn inline_text( /// Check whether a space is needed between two adjacent chunks of output. /// Returns true when the left side doesn't end with whitespace and the right -/// side doesn't start with whitespace — i.e., two words would be mashed together. +/// side doesn't start with whitespace, except around punctuation that should +/// bind to the adjacent token. fn needs_separator(left: &str, right: &str) -> bool { - let l = left.as_bytes().last().copied().unwrap_or(b' '); - let r = right.as_bytes().first().copied().unwrap_or(b' '); - !l.is_ascii_whitespace() && !r.is_ascii_whitespace() + let l = left.chars().next_back().unwrap_or(' '); + let r = right.chars().next().unwrap_or(' '); + + if l.is_whitespace() || r.is_whitespace() { + return false; + } + + // Do not create "word ," / "word )" / "word 's" artifacts. + if is_closing_punctuation(r) { + return false; + } + + // Do not create "( word" / "[ 1" artifacts. + if is_opening_punctuation(l) { + return false; + } + + // Common inline-code suffixes: `Option`s, `x`'s. Treat them like a + // single token rather than separating the text node. + if matches!(l, '`' | ')') && starts_with_inline_code_suffix(right) { + return false; + } + + true +} + +fn starts_with_inline_code_suffix(s: &str) -> bool { + let trimmed = s.trim_start_matches(['*', '_']); + let mut chars = trimmed.chars(); + let Some(first) = chars.next() else { + return false; + }; + + if matches!(first, '\'' | '’') { + return true; + } + + if !matches!(first, 's' | 'S') { + return false; + } + + match chars.next() { + None => true, + Some(c) => c.is_whitespace() || is_closing_punctuation(c) || matches!(c, '*' | '_'), + } +} + +fn is_closing_punctuation(c: char) -> bool { + matches!( + c, + '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '}' | '%' | '\'' | '’' | '"' | '”' + ) +} + +fn is_opening_punctuation(c: char) -> bool { + matches!(c, '(' | '[' | '{' | '"' | '“') } /// Collect raw text content (no markdown formatting). @@ -1606,4 +1666,39 @@ mod tests { "collapse_whitespace stripped 6-space indent: {output}" ); } + + #[test] + fn text_after_inline_element_keeps_separator() { + // Reuters-style markup: agoTanker crosses... + // The "ago" text node sits between two element children. Without a + // separator check on the Text branch, "ago" + "Tanker" would smash + // together as "agoTanker". + let html = r#"
3hagoTanker crosses Strait
"#; + let (md, _, _) = convert_html(html, None); + assert!( + !md.contains("agoTanker"), + "Element->Text->Element smashed together: {md}" + ); + } + + #[test] + fn punctuation_after_inline_element_stays_attached() { + let html = r#"

Hello, world. Use package.json.

"#; + let (md, _, _) = convert_html(html, None); + assert!(md.contains("Hello, world"), "punctuation detached: {md}"); + assert!( + md.contains("`package.json`."), + "code punctuation detached: {md}" + ); + } + + #[test] + fn inline_code_suffix_stays_attached() { + let html = r#"

NullPointerExceptions are common.

"#; + let (md, _, _) = convert_html(html, None); + assert!( + md.contains("[`NullPointerException`](https://example.com)*s* are common"), + "code suffix detached: {md}" + ); + } } From af96628dc9c3ca3ba7f428967c49f0f668eda8e8 Mon Sep 17 00:00:00 2001 From: Valerio <88933932+0xMassi@users.noreply.github.com> Date: Sun, 10 May 2026 22:44:57 +0200 Subject: [PATCH 13/49] Revise README for clarity and updated content Updated the README to reflect changes in the project description, banner image size, and various content sections. Enhanced clarity on features and usage. --- README.md | 584 +++++++++++++++++++++++++----------------------------- 1 file changed, 275 insertions(+), 309 deletions(-) diff --git a/README.md b/README.md index 7d936c6..a663511 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@

- webclaw + webclaw

webclaw

- The fastest web scraper for AI agents.
- 67% fewer tokens. Sub-millisecond extraction. Zero browser overhead. + Turn websites into clean markdown, JSON, and LLM-ready context.
+ CLI, MCP server, REST API, and SDKs for AI agents and RAG pipelines.

@@ -17,64 +17,58 @@ License npm installs

+

Discord X / Twitter - Website + Hosted webclaw Docs

---- -

- Claude Code: web_fetch gets 403, webclaw extracts successfully -
- Claude Code's built-in web_fetch → 403 Forbidden. webclaw → clean markdown. + webclaw extracting clean markdown from a page

--- -Your AI agent calls `fetch()` and gets a 403. Or 142KB of raw HTML that burns through your token budget. **webclaw fixes both.** +Most web scraping tools give your agent one of two bad outputs: -It extracts clean, structured content from any URL using Chrome-level TLS fingerprinting — no headless browser, no Selenium, no Puppeteer. Output is optimized for LLMs: **67% fewer tokens** than raw HTML, with metadata, links, and images preserved. +- a blocked page, login wall, or empty app shell +- raw HTML full of nav, scripts, styling, ads, and duplicated boilerplate +[webclaw.io](https://webclaw.io) is the hosted web extraction API for webclaw. This repo contains the open-source CLI, MCP server, extraction engine, and self-hostable server. + +webclaw turns a URL into clean content your tools can actually use. + +```bash +webclaw https://example.com --format markdown ``` - Raw HTML webclaw -┌──────────────────────────────────┐ ┌──────────────────────────────────┐ -│
│ │ # Breaking: AI Breakthrough │ -│