From 615f3266603a7915b1e898c74ad1eb3a895e6c2f Mon Sep 17 00:00:00 2001 From: Valerio Date: Mon, 4 May 2026 21:52:49 +0200 Subject: [PATCH 01/55] docs: update changelog for brand extraction --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index afec609..01e4612 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.5.8] — 2026-05-04 + +### Fixed +- Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. + +--- + ## [0.5.7] — 2026-04-30 ### Security From a542e45768d54dc7f028485df7d18b6d8954b5e7 Mon Sep 17 00:00:00 2001 From: Justin Levine <20596508+jal-co@users.noreply.github.com> Date: Tue, 5 May 2026 02:17:21 -0700 Subject: [PATCH 02/55] docs: refresh README badges Replace README badges with shieldcn-styled badges. --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 623a4d3..4362d35 100644 --- a/README.md +++ b/README.md @@ -12,16 +12,16 @@

- Stars - Version - License - npm installs + Stars + Version + License + npm installs

- Discord - X / Twitter - Website - Docs + Discord + X / Twitter + Website + Docs

--- From a1242a1c1d116c142c6a98ee18e27f50a90d201d Mon Sep 17 00:00:00 2001 From: Valerio Date: Tue, 5 May 2026 11:18:58 +0200 Subject: [PATCH 03/55] docs: credit README badge refresh --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01e4612..53f636f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). ### Fixed - Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. +### Docs +- Refreshed the README badges with a cleaner shieldcn style. Thanks to Justin Levine (`@jal-co`) for the contribution, and shout-out to his open-source [shieldcn](https://github.com/jal-co/shieldcn) project. + --- ## [0.5.7] — 2026-04-30 From 513b0e493eaa7a7e47f5cb44880bb837be312477 Mon Sep 17 00:00:00 2001 From: SURYANSH MISHRA Date: Tue, 5 May 2026 11:38:30 +0200 Subject: [PATCH 04/55] ci: add Windows release artifacts Closes #34 --- .github/workflows/release.yml | 36 +++++++++++++++++++++++++++-------- CHANGELOG.md | 3 +++ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4c4c241..b2ea54a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -27,6 +27,8 @@ jobs: os: ubuntu-latest - target: aarch64-unknown-linux-gnu os: ubuntu-latest + - target: x86_64-pc-windows-msvc + os: windows-latest steps: - uses: actions/checkout@v4 @@ -57,6 +59,12 @@ jobs: if: matrix.target != 'aarch64-unknown-linux-gnu' && runner.os == 'Linux' run: sudo apt-get update && sudo apt-get install -y cmake + - name: Install NASM (Windows) + if: runner.os == 'Windows' + run: | + choco install nasm -y + echo "C:\Program Files\NASM" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Build run: cargo build --release 
--target ${{ matrix.target }} @@ -71,12 +79,22 @@ jobs: # don't repeat that mistake. If a future binary gets renamed or # removed, this step should scream, not quietly publish an # incomplete release. - cp target/${{ matrix.target }}/release/webclaw "$staging/" - cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/" - cp target/${{ matrix.target }}/release/webclaw-server "$staging/" - cp README.md LICENSE "$staging/" - tar czf "$staging.tar.gz" "$staging" - echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV + + if [[ "${{ matrix.os }}" == "windows-latest" ]]; then + cp target/${{ matrix.target }}/release/webclaw.exe "$staging/" + cp target/${{ matrix.target }}/release/webclaw-mcp.exe "$staging/" + cp target/${{ matrix.target }}/release/webclaw-server.exe "$staging/" + cp README.md LICENSE "$staging/" + 7z a -tzip "$staging.zip" "$staging" + echo "ASSET=$staging.zip" >> $GITHUB_ENV + else + cp target/${{ matrix.target }}/release/webclaw "$staging/" + cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/" + cp target/${{ matrix.target }}/release/webclaw-server "$staging/" + cp README.md LICENSE "$staging/" + tar czf "$staging.tar.gz" "$staging" + echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV + fi - name: Upload artifact uses: actions/upload-artifact@v4 @@ -99,7 +117,8 @@ jobs: run: | cd artifacts find . -name '*.tar.gz' -exec mv {} . \; - sha256sum *.tar.gz > SHA256SUMS + find . -name '*.zip' -exec mv {} . 
\; + sha256sum *.tar.gz *.zip > SHA256SUMS 2>/dev/null || sha256sum * > SHA256SUMS cat SHA256SUMS - name: Create GitHub Release @@ -108,6 +127,7 @@ jobs: generate_release_notes: true files: | artifacts/*.tar.gz + artifacts/*.zip artifacts/SHA256SUMS docker: @@ -181,7 +201,7 @@ jobs: tag="${GITHUB_REF#refs/tags/}" base="https://github.com/0xMassi/webclaw/releases/download/${tag}" - # Download all 4 tarballs and compute SHAs + # Download all tarballs (Linux + macOS) and compute SHAs for target in aarch64-apple-darwin x86_64-apple-darwin aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu; do curl -sSL "${base}/webclaw-${tag}-${target}.tar.gz" -o "${target}.tar.gz" done diff --git a/CHANGELOG.md b/CHANGELOG.md index 53f636f..4e2a0ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). ## [0.5.8] — 2026-05-04 +### Added +- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. + ### Fixed - Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. From 86183b11e4e4e8e695836a6b2b042f3df0994985 Mon Sep 17 00:00:00 2001 From: Valerio Date: Tue, 5 May 2026 11:44:07 +0200 Subject: [PATCH 05/55] docs: credit Windows release contribution --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e2a0ee..63d163f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). ## [0.5.8] — 2026-05-04 ### Added -- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. +- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. 
Thanks to Suryansh Mishra (`@notrealsuryansh`) for the contribution. ### Fixed - Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. From a3aa4bce6f7a9a4d1b4d3e8bdb78edea75042a73 Mon Sep 17 00:00:00 2001 From: Valerio Date: Wed, 6 May 2026 11:36:53 +0200 Subject: [PATCH 06/55] fix: support LLM provider compatibility options Closes #36 --- CHANGELOG.md | 1 + README.md | 3 + crates/webclaw-cli/src/main.rs | 5 +- crates/webclaw-llm/src/chain.rs | 2 +- crates/webclaw-llm/src/providers/anthropic.rs | 61 +++++++- crates/webclaw-llm/src/providers/openai.rs | 137 ++++++++++++++++-- 6 files changed, 193 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 63d163f..8e30acd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). ### Added - GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. Thanks to Suryansh Mishra (`@notrealsuryansh`) for the contribution. +- LLM providers now support `ANTHROPIC_BASE_URL` for Anthropic-compatible proxies, plus an `OPENAI_RESPONSE_FORMAT_TYPE` override for OpenAI-compatible backends such as LM Studio. Thanks to Toti (`@Toti330`) for the report. ### Fixed - Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. 
diff --git a/README.md b/README.md index 4362d35..79758f0 100644 --- a/README.md +++ b/README.md @@ -358,7 +358,10 @@ webclaw/ | `WEBCLAW_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) | | `OLLAMA_HOST` | Ollama URL for local LLM features (default: `http://localhost:11434`) | | `OPENAI_API_KEY` | OpenAI API key for LLM features | +| `OPENAI_BASE_URL` | OpenAI-compatible base URL (default: `https://api.openai.com/v1`) | +| `OPENAI_RESPONSE_FORMAT_TYPE` | JSON-mode response format for OpenAI-compatible backends: `json_object` (default), `json_schema`, or `text`. Use `text` or `json_schema` for LM Studio. | | `ANTHROPIC_API_KEY` | Anthropic API key for LLM features | +| `ANTHROPIC_BASE_URL` | Anthropic-compatible base URL (default: `https://api.anthropic.com/v1`) | | `WEBCLAW_PROXY` | Single proxy URL | | `WEBCLAW_PROXY_FILE` | Path to proxy pool file | diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index e97f15d..a45bce8 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -260,7 +260,7 @@ struct Cli { #[arg(long, env = "WEBCLAW_LLM_MODEL")] llm_model: Option, - /// Override the LLM base URL (Ollama or OpenAI-compatible) + /// Override the LLM base URL (Ollama, OpenAI-compatible, or Anthropic-compatible) #[arg(long, env = "WEBCLAW_LLM_BASE_URL")] llm_base_url: Option, @@ -1919,8 +1919,9 @@ async fn build_llm_provider(cli: &Cli) -> Result, String> { Ok(Box::new(provider)) } "anthropic" => { - let provider = webclaw_llm::providers::anthropic::AnthropicProvider::new( + let provider = webclaw_llm::providers::anthropic::AnthropicProvider::with_base_url( None, + cli.llm_base_url.clone(), cli.llm_model.clone(), ) .ok_or("ANTHROPIC_API_KEY not set")?; diff --git a/crates/webclaw-llm/src/chain.rs b/crates/webclaw-llm/src/chain.rs index 314bf2a..86b0101 100644 --- a/crates/webclaw-llm/src/chain.rs +++ b/crates/webclaw-llm/src/chain.rs @@ -34,7 +34,7 @@ impl ProviderChain { 
providers.push(Box::new(openai)); } - if let Some(anthropic) = AnthropicProvider::new(None, None) { + if let Some(anthropic) = AnthropicProvider::with_base_url(None, None, None) { debug!("anthropic configured, adding to chain"); providers.push(Box::new(anthropic)); } diff --git a/crates/webclaw-llm/src/providers/anthropic.rs b/crates/webclaw-llm/src/providers/anthropic.rs index 71ca1f9..e6e43c8 100644 --- a/crates/webclaw-llm/src/providers/anthropic.rs +++ b/crates/webclaw-llm/src/providers/anthropic.rs @@ -10,23 +10,38 @@ use crate::provider::{CompletionRequest, LlmProvider}; use super::load_api_key; -const ANTHROPIC_API_URL: &str = "https://api.anthropic.com/v1/messages"; +const DEFAULT_ANTHROPIC_BASE_URL: &str = "https://api.anthropic.com/v1"; const ANTHROPIC_VERSION: &str = "2023-06-01"; pub struct AnthropicProvider { client: reqwest::Client, key: String, + base_url: String, default_model: String, } impl AnthropicProvider { /// Returns `None` if no API key is available (param or env). pub fn new(key_override: Option, model: Option) -> Option { + Self::with_base_url(key_override, None, model) + } + + /// Returns `None` if no API key is available (param or env). 
+ pub fn with_base_url( + key_override: Option, + base_url: Option, + model: Option, + ) -> Option { let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?; Some(Self { client: reqwest::Client::new(), key, + base_url: base_url + .or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok()) + .unwrap_or_else(|| DEFAULT_ANTHROPIC_BASE_URL.into()) + .trim_end_matches('/') + .to_string(), default_model: model.unwrap_or_else(|| "claude-sonnet-4-20250514".into()), }) } @@ -34,6 +49,14 @@ impl AnthropicProvider { pub fn default_model(&self) -> &str { &self.default_model } + + fn messages_url(&self) -> String { + if self.base_url.ends_with("/messages") { + self.base_url.clone() + } else { + format!("{}/messages", self.base_url) + } + } } #[async_trait] @@ -74,7 +97,7 @@ impl LlmProvider for AnthropicProvider { let resp = self .client - .post(ANTHROPIC_API_URL) + .post(self.messages_url()) .header("x-api-key", &self.key) .header("anthropic-version", ANTHROPIC_VERSION) .header("content-type", "application/json") @@ -135,6 +158,11 @@ mod tests { assert_eq!(provider.name(), "anthropic"); assert_eq!(provider.default_model, "claude-sonnet-4-20250514"); assert_eq!(provider.key, "sk-ant-test"); + assert_eq!(provider.base_url, "https://api.anthropic.com/v1"); + assert_eq!( + provider.messages_url(), + "https://api.anthropic.com/v1/messages" + ); } #[test] @@ -151,6 +179,35 @@ mod tests { assert_eq!(provider.default_model(), "claude-sonnet-4-20250514"); } + #[test] + fn custom_base_url_appends_messages_path() { + let provider = AnthropicProvider::with_base_url( + Some("sk-ant-test".into()), + Some("https://proxy.example.test/anthropic/v1/".into()), + None, + ) + .unwrap(); + assert_eq!(provider.base_url, "https://proxy.example.test/anthropic/v1"); + assert_eq!( + provider.messages_url(), + "https://proxy.example.test/anthropic/v1/messages" + ); + } + + #[test] + fn custom_full_messages_url_is_not_doubled() { + let provider = AnthropicProvider::with_base_url( + 
Some("sk-ant-test".into()), + Some("https://proxy.example.test/v1/messages".into()), + None, + ) + .unwrap(); + assert_eq!( + provider.messages_url(), + "https://proxy.example.test/v1/messages" + ); + } + // Env var fallback tests mutate process-global state and race with parallel tests. // The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed: // cargo test -p webclaw-llm env_var -- --ignored --test-threads=1 diff --git a/crates/webclaw-llm/src/providers/openai.rs b/crates/webclaw-llm/src/providers/openai.rs index 6422cc4..3780d8f 100644 --- a/crates/webclaw-llm/src/providers/openai.rs +++ b/crates/webclaw-llm/src/providers/openai.rs @@ -13,6 +13,50 @@ pub struct OpenAiProvider { key: String, base_url: String, default_model: String, + response_format: OpenAiResponseFormat, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum OpenAiResponseFormat { + JsonObject, + JsonSchema, + Text, +} + +impl OpenAiResponseFormat { + fn from_env() -> Self { + std::env::var("OPENAI_RESPONSE_FORMAT_TYPE") + .ok() + .and_then(|value| Self::parse(&value)) + .unwrap_or(Self::JsonObject) + } + + fn parse(value: &str) -> Option { + match value.trim().to_ascii_lowercase().as_str() { + "" | "json_object" => Some(Self::JsonObject), + "json_schema" => Some(Self::JsonSchema), + "text" => Some(Self::Text), + _ => None, + } + } + + fn as_response_format(self) -> serde_json::Value { + match self { + Self::JsonObject => json!({ "type": "json_object" }), + Self::JsonSchema => json!({ + "type": "json_schema", + "json_schema": { + "name": "webclaw_response", + "schema": { + "type": "object", + "additionalProperties": true + }, + "strict": false + } + }), + Self::Text => json!({ "type": "text" }), + } + } } impl OpenAiProvider { @@ -31,23 +75,15 @@ impl OpenAiProvider { .or_else(|| std::env::var("OPENAI_BASE_URL").ok()) .unwrap_or_else(|| "https://api.openai.com/v1".into()), default_model: model.unwrap_or_else(|| "gpt-4o-mini".into()), + response_format: 
OpenAiResponseFormat::from_env(), }) } pub fn default_model(&self) -> &str { &self.default_model } -} - -#[async_trait] -impl LlmProvider for OpenAiProvider { - async fn complete(&self, request: &CompletionRequest) -> Result { - let model = if request.model.is_empty() { - &self.default_model - } else { - &request.model - }; + fn request_body(&self, request: &CompletionRequest, model: &str) -> serde_json::Value { let messages: Vec = request .messages .iter() @@ -60,7 +96,7 @@ impl LlmProvider for OpenAiProvider { }); if request.json_mode { - body["response_format"] = json!({ "type": "json_object" }); + body["response_format"] = self.response_format.as_response_format(); } if let Some(temp) = request.temperature { body["temperature"] = json!(temp); @@ -69,6 +105,21 @@ impl LlmProvider for OpenAiProvider { body["max_tokens"] = json!(max); } + body + } +} + +#[async_trait] +impl LlmProvider for OpenAiProvider { + async fn complete(&self, request: &CompletionRequest) -> Result { + let model = if request.model.is_empty() { + &self.default_model + } else { + &request.model + }; + + let body = self.request_body(request, model); + let url = format!("{}/chat/completions", self.base_url); let resp = self .client @@ -136,6 +187,7 @@ mod tests { assert_eq!(provider.default_model, "gpt-4o-mini"); assert_eq!(provider.base_url, "https://api.openai.com/v1"); assert_eq!(provider.key, "test-key-123"); + assert_eq!(provider.response_format, OpenAiResponseFormat::JsonObject); } #[test] @@ -161,6 +213,69 @@ mod tests { assert_eq!(provider.default_model(), "gpt-4o-mini"); } + #[test] + fn json_mode_defaults_to_openai_json_object() { + let provider = OpenAiProvider::new( + Some("test-key".into()), + Some("https://api.openai.com/v1".into()), + None, + ) + .unwrap(); + let req = CompletionRequest { + model: String::new(), + messages: vec![], + temperature: None, + max_tokens: None, + json_mode: true, + }; + let body = provider.request_body(&req, provider.default_model()); + 
assert_eq!(body["response_format"], json!({ "type": "json_object" })); + } + + #[test] + fn json_schema_response_format_for_compatible_backends() { + let mut provider = OpenAiProvider::new( + Some("test-key".into()), + Some("http://localhost:1234/v1".into()), + Some("local-model".into()), + ) + .unwrap(); + provider.response_format = OpenAiResponseFormat::JsonSchema; + let req = CompletionRequest { + model: String::new(), + messages: vec![], + temperature: None, + max_tokens: None, + json_mode: true, + }; + let body = provider.request_body(&req, provider.default_model()); + assert_eq!(body["response_format"]["type"], "json_schema"); + assert_eq!( + body["response_format"]["json_schema"]["schema"]["type"], + "object" + ); + } + + #[test] + fn text_response_format_for_lm_studio() { + let mut provider = OpenAiProvider::new( + Some("test-key".into()), + Some("http://localhost:1234/v1".into()), + Some("local-model".into()), + ) + .unwrap(); + provider.response_format = OpenAiResponseFormat::Text; + let req = CompletionRequest { + model: String::new(), + messages: vec![], + temperature: None, + max_tokens: None, + json_mode: true, + }; + let body = provider.request_body(&req, provider.default_model()); + assert_eq!(body["response_format"], json!({ "type": "text" })); + } + // Env var fallback tests mutate process-global state and race with parallel tests. // The code path is trivial (load_api_key -> env::var().ok()). 
Run in isolation if needed: // cargo test -p webclaw-llm env_var -- --ignored --test-threads=1 From e6a95f783dd9eea4fe0b34bfc0e8f70bf3ff74f5 Mon Sep 17 00:00:00 2001 From: Valerio Date: Wed, 6 May 2026 11:42:09 +0200 Subject: [PATCH 07/55] chore: bump version to 0.5.9 --- CHANGELOG.md | 8 +++++++- Cargo.lock | 14 +++++++------- Cargo.toml | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e30acd..7858ae4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,17 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.5.9] — 2026-05-06 + +### Fixed +- LLM providers now support `ANTHROPIC_BASE_URL` for Anthropic-compatible proxies, plus an `OPENAI_RESPONSE_FORMAT_TYPE` override for OpenAI-compatible backends such as LM Studio. Thanks to Toti (`@Toti330`) for the report. + +--- + ## [0.5.8] — 2026-05-04 ### Added - GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. Thanks to Suryansh Mishra (`@notrealsuryansh`) for the contribution. -- LLM providers now support `ANTHROPIC_BASE_URL` for Anthropic-compatible proxies, plus an `OPENAI_RESPONSE_FORMAT_TYPE` override for OpenAI-compatible backends such as LM Studio. Thanks to Toti (`@Toti330`) for the report. ### Fixed - Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. 
diff --git a/Cargo.lock b/Cargo.lock index 4a6b90e..e49ccc3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3219,7 +3219,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.5.8" +version = "0.5.9" dependencies = [ "clap", "dotenvy", @@ -3240,7 +3240,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.5.8" +version = "0.5.9" dependencies = [ "ego-tree", "once_cell", @@ -3258,7 +3258,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.5.8" +version = "0.5.9" dependencies = [ "async-trait", "bytes", @@ -3284,7 +3284,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.5.8" +version = "0.5.9" dependencies = [ "async-trait", "reqwest", @@ -3297,7 +3297,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.5.8" +version = "0.5.9" dependencies = [ "dirs", "dotenvy", @@ -3317,7 +3317,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.5.8" +version = "0.5.9" dependencies = [ "pdf-extract", "thiserror", @@ -3326,7 +3326,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.5.8" +version = "0.5.9" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index f77595d..12a4b73 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.5.8" +version = "0.5.9" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" From 7f7514395415484e0e9da3ad5178e0578917e09d Mon Sep 17 00:00:00 2001 From: Valerio Date: Wed, 6 May 2026 17:16:35 +0200 Subject: [PATCH 08/55] docs: update hosted api trial copy --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 79758f0..7d936c6 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ It extracts clean, structured content from any URL using Chrome-level TLS finger **Self-host.** Free, AGPL-3.0, runs locally. Get the CLI, MCP server, or REST API in one command. 
Ships with the 8 core extraction tools: scrape, crawl, map, batch, extract, summarize, diff, brand. -**Hosted API** at **[webclaw.io](https://webclaw.io)**. 500 pages/month free, no card. Adds what self-hosting can't do alone: antibot bypass (Cloudflare, DataDome, WAF), JS rendering, async crawl/research jobs, web search, watches. For when you want it to *just work*. +**Hosted API** at **[webclaw.io](https://webclaw.io)**. Start with a 7-day Starter trial, card required. Adds what self-hosting can't do alone: antibot bypass (Cloudflare, DataDome, WAF), JS rendering, async crawl/research jobs, web search, watches. For when you want it to *just work*. --- From e8ca1417d699d977fd4d08af435758be127e7226 Mon Sep 17 00:00:00 2001 From: devnen Date: Sun, 10 May 2026 15:11:12 +0200 Subject: [PATCH 09/55] Improve --format llm output quality (#37) Improve LLM-format output for modern news and documentation pages. - Filter noisy hydration and low-value page chrome structured data while preserving content-bearing Schema.org records - Fix element/text spacing without detaching punctuation on docs, forums, and reference pages - Remove common accessibility link chrome from LLM text and link labels - Bump workspace version to 0.6.0 and update the changelog Thanks to Nenad Oric (@devnen) for the original PR and contribution. --- CHANGELOG.md | 9 ++ Cargo.lock | 14 +-- Cargo.toml | 2 +- crates/webclaw-core/src/llm/body.rs | 3 + crates/webclaw-core/src/llm/cleanup.rs | 83 ++++++++++++++ crates/webclaw-core/src/llm/links.rs | 25 +++++ crates/webclaw-core/src/llm/mod.rs | 148 ++++++++++++++++++++++++- crates/webclaw-core/src/markdown.rs | 103 ++++++++++++++++- 8 files changed, 371 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7858ae4..025b1db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,15 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). 
+## [0.6.0] — 2026-05-10 + +### Fixed +- Improved `--format llm` output quality on modern news and documentation pages. Framework hydration blobs and low-value page chrome structured-data records are now filtered out before they can flood the LLM context, while content-bearing Schema.org records are preserved. Thanks and congrats to Nenad Oric (`@devnen`) for the contribution in PR #37. +- Fixed element-to-text spacing so adjacent inline nodes no longer smash words together, while punctuation stays attached on real pages such as docs, forums, and reference sites. +- Removed common screen-reader-only link chrome such as "opens new tab" from LLM body text and link labels without stripping ordinary prose that happens to mention external links. + +--- + ## [0.5.9] — 2026-05-06 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index e49ccc3..ab23a3f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3219,7 +3219,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.5.9" +version = "0.6.0" dependencies = [ "clap", "dotenvy", @@ -3240,7 +3240,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.5.9" +version = "0.6.0" dependencies = [ "ego-tree", "once_cell", @@ -3258,7 +3258,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.5.9" +version = "0.6.0" dependencies = [ "async-trait", "bytes", @@ -3284,7 +3284,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.5.9" +version = "0.6.0" dependencies = [ "async-trait", "reqwest", @@ -3297,7 +3297,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.5.9" +version = "0.6.0" dependencies = [ "dirs", "dotenvy", @@ -3317,7 +3317,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.5.9" +version = "0.6.0" dependencies = [ "pdf-extract", "thiserror", @@ -3326,7 +3326,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.5.9" +version = "0.6.0" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index 
12a4b73..6e87225 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.5.9" +version = "0.6.0" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-core/src/llm/body.rs b/crates/webclaw-core/src/llm/body.rs index 5311121..db2a011 100644 --- a/crates/webclaw-core/src/llm/body.rs +++ b/crates/webclaw-core/src/llm/body.rs @@ -29,6 +29,9 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody { // 0c. Strip leaked JavaScript (framework hydration, self.__wrap_n, etc.) let text = cleanup::strip_leaked_js(&text); + // 0c2. Strip a11y link chrome ("opens new tab", external link hints) + let text = cleanup::strip_a11y_link_chrome(&text); + // 0d. Collapse spaced-out text (CSS animation artifacts like "S t a r t") // Must run before any dedup -- spaced text confuses word-based dedup. let text = cleanup::collapse_spaced_text(&text); diff --git a/crates/webclaw-core/src/llm/cleanup.rs b/crates/webclaw-core/src/llm/cleanup.rs index c8e14ed..dc447a5 100644 --- a/crates/webclaw-core/src/llm/cleanup.rs +++ b/crates/webclaw-core/src/llm/cleanup.rs @@ -146,6 +146,45 @@ pub(crate) fn strip_leaked_js(input: &str) -> String { out } +// --------------------------------------------------------------------------- +// Accessibility link chrome ("opens new tab", "external link") +// --------------------------------------------------------------------------- + +/// Strip screen-reader-only link chrome that bleeds into rendered text. +/// +/// Sites like Reuters wrap external/new-window links with hidden spans +/// like `, opens new tab`. The noise +/// filter can't reliably catch these (no consistent class hook across +/// sites), so they end up duplicated all over the body text. This is a +/// targeted text-level scrub of the most common phrasings. 
+pub(crate) fn strip_a11y_link_chrome(input: &str) -> String { + static A11Y_PATTERN: Lazy = Lazy::new(|| { + Regex::new( + r"(?i)(?:\s*,\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?|\s+\((?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\)\.?|\s+external link\b\.?$)", + ) + .unwrap() + }); + + let mut out = String::with_capacity(input.len()); + let mut in_code_fence = false; + for (i, line) in input.lines().enumerate() { + if i > 0 { + out.push('\n'); + } + if line.trim().starts_with("```") { + in_code_fence = !in_code_fence; + out.push_str(line); + continue; + } + if in_code_fence { + out.push_str(line); + continue; + } + out.push_str(&A11Y_PATTERN.replace_all(line, "")); + } + out +} + // --------------------------------------------------------------------------- // Spaced-out text collapsing (CSS animation artifacts) // --------------------------------------------------------------------------- @@ -1356,4 +1395,48 @@ mod tests { let input = "```\nImage of something in code\n```"; assert_eq!(strip_alt_text_noise(input), input); } + + #[test] + fn a11y_strips_opens_new_tab() { + let input = "Download the App, opens new tab and Subscribe, opens new tab."; + let out = strip_a11y_link_chrome(input); + assert!(!out.to_lowercase().contains("opens new tab"), "leak: {out}"); + assert!(out.contains("Download the App")); + assert!(out.contains("Subscribe")); + } + + #[test] + fn a11y_strips_external_link_variants() { + let cases = [ + ("Visit our docs, opens external link", "Visit our docs"), + ("Click here, opens in a new window.", "Click here"), + ("More info external link", "More info"), + ]; + for (input, expected_prefix) in cases { + let out = strip_a11y_link_chrome(input); + assert!( + out.starts_with(expected_prefix), + "input={input:?} got={out:?}" + ); + assert!(!out.to_lowercase().contains("opens"), "leak: {out}"); + } + } + + #[test] + fn a11y_preserves_code_blocks() { + 
let input = "```\nopens new tab is a function\n```\nDownload, opens new tab"; + let out = strip_a11y_link_chrome(input); + assert!( + out.contains("opens new tab is a function"), + "code stripped: {out}" + ); + // Outside the fence, the chrome is removed. + assert!(!out.to_lowercase().contains("download, opens new tab")); + } + + #[test] + fn a11y_preserves_external_link_prose() { + let input = "Researchers found an external link between the two incidents."; + assert_eq!(strip_a11y_link_chrome(input), input); + } } diff --git a/crates/webclaw-core/src/llm/links.rs b/crates/webclaw-core/src/llm/links.rs index 0656aac..3d25179 100644 --- a/crates/webclaw-core/src/llm/links.rs +++ b/crates/webclaw-core/src/llm/links.rs @@ -88,10 +88,19 @@ fn is_noise_link(text: &str, href: &str) -> bool { static MD_MARKERS_RE: Lazy = Lazy::new(|| Regex::new(r"#{1,6}\s+|\*{1,2}|_{1,2}|`").unwrap()); +static A11Y_LABEL_RE: Lazy = Lazy::new(|| { + Regex::new( + r"(?i)(?:\s*,?\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website))\b\.?|\s*,\s*external link\b\.?|\s+external link\b\.?$)", + ) + .unwrap() +}); + /// Clean a link label: strip markdown, dedup repeated phrases, truncate. pub(crate) fn clean_link_label(raw: &str) -> String { // Strip markdown markers let label = MD_MARKERS_RE.replace_all(raw, "").to_string(); + // Strip a11y link chrome ("opens new tab", etc.) 
+ let label = A11Y_LABEL_RE.replace_all(&label, "").to_string(); let label = label.split_whitespace().collect::>().join(" "); // Dedup repeated phrases in label @@ -181,4 +190,20 @@ mod tests { assert!(is_noise_link("user", "https://hn.com/user?id=foo")); assert!(!is_noise_link("Rust docs", "https://rust-lang.org")); } + + #[test] + fn link_label_preserves_external_link_prose() { + assert_eq!( + clean_link_label("Research found an external link between incidents"), + "Research found an external link between incidents" + ); + } + + #[test] + fn link_label_strips_terminal_external_link_chrome() { + assert_eq!( + clean_link_label("Reuters story external link"), + "Reuters story" + ); + } } diff --git a/crates/webclaw-core/src/llm/mod.rs b/crates/webclaw-core/src/llm/mod.rs index 126558f..bc65be6 100644 --- a/crates/webclaw-core/src/llm/mod.rs +++ b/crates/webclaw-core/src/llm/mod.rs @@ -46,15 +46,73 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String { } // -- 4. Structured data (NEXT_DATA, SvelteKit, JSON-LD) -- - if !result.structured_data.is_empty() { - out.push_str("\n\n## Structured Data\n\n```json\n"); - out.push_str(&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default()); - out.push_str("\n```"); + // Only emit useful items: Schema.org records with a meaningful @type, + // and only if the total serialized size stays under a budget. Framework + // hydration blobs (Next.js pageProps full of ad-targeting flags, build + // IDs, schedule paths) explode to hundreds of KB and drown the LLM in + // noise — drop them rather than ship them. 
+ let useful: Vec<_> = result + .structured_data + .iter() + .filter(|v| is_useful_structured_data(v)) + .cloned() + .collect(); + if !useful.is_empty() { + let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default(); + const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024; + if serialized.len() <= STRUCTURED_DATA_MAX_BYTES { + out.push_str("\n\n## Structured Data\n\n```json\n"); + out.push_str(&serialized); + out.push_str("\n```"); + } } out.trim().to_string() } +/// Decide whether a structured-data value carries content worth emitting. +/// +/// Schema.org records with a recognizable content `@type` (Article, NewsArticle, +/// Product, Recipe, FAQPage, HowTo, Event, Person, Organization, BreadcrumbList, +/// VideoObject, JobPosting, etc.) are kept. Generic `WebSite` / `WebPage` / +/// `SiteNavigationElement` records and Next.js `pageProps`-style blobs without a useful +/// `@type` are dropped — they're almost always navigation chrome or framework +/// hydration state. +fn is_useful_structured_data(v: &serde_json::Value) -> bool { + let Some(obj) = v.as_object() else { + // SvelteKit can emit compact arrays of page data. Keep those if they + // are small enough to be useful, while still dropping giant hydration + // arrays under the same budget as untyped objects. + if v.is_array() { + let serialized = serde_json::to_string(v).unwrap_or_default(); + return serialized.len() <= 4 * 1024; + } + return false; + }; + // JSON-LD: @type drives the decision. + if let Some(t) = obj.get("@type") { + let types: Vec = match t { + serde_json::Value::String(s) => vec![s.to_ascii_lowercase()], + serde_json::Value::Array(a) => a + .iter() + .filter_map(|x| x.as_str()) + .map(str::to_ascii_lowercase) + .collect(), + _ => Vec::new(), + }; + if types.is_empty() { + return false; + } + // Drop low-info chrome types.
+ const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"]; + return types.iter().any(|t| !DROP_TYPES.iter().any(|d| t == d)); + } + // Next.js pageProps / SvelteKit data without @type: keep only if compact. + // Anything over ~4KB is almost certainly hydration state, not content. + let serialized = serde_json::to_string(v).unwrap_or_default(); + serialized.len() <= 4 * 1024 +} + // --------------------------------------------------------------------------- // Integration tests that exercise the full pipeline through to_llm_text // --------------------------------------------------------------------------- @@ -700,4 +758,86 @@ mod tests { assert!(out.contains("Some content"), "Content before lost: {out}"); assert!(out.contains("More content"), "Content after lost: {out}"); } + + // -- Structured-data gating tests -- + + fn make_result_with_structured(values: Vec) -> ExtractionResult { + let mut r = make_result("# Body"); + r.structured_data = values; + r + } + + #[test] + fn structured_data_drops_chrome_types() { + // WebSite/WebPage records are framework chrome — should be dropped. + let r = make_result_with_structured(vec![serde_json::json!({ + "@type": "WebSite", + "name": "Example", + "url": "https://example.com" + })]); + let out = to_llm_text(&r, None); + assert!( + !out.contains("## Structured Data"), + "WebSite chrome leaked into output: {out}" + ); + } + + #[test] + fn structured_data_keeps_article_types() { + let r = make_result_with_structured(vec![serde_json::json!({ + "@type": "NewsArticle", + "headline": "Big news", + "datePublished": "2026-05-10" + })]); + let out = to_llm_text(&r, None); + assert!( + out.contains("## Structured Data"), + "NewsArticle dropped: {out}" + ); + assert!(out.contains("Big news")); + } + + #[test] + fn structured_data_drops_oversized_blob() { + // 32KB pageProps-style blob with no @type — should be dropped. 
+ let big = "x".repeat(32 * 1024); + let r = make_result_with_structured(vec![serde_json::json!({ + "buildId": "abc", + "isFallback": false, + "noise": big + })]); + let out = to_llm_text(&r, None); + assert!( + !out.contains("## Structured Data"), + "Oversized untyped blob leaked: len={}", + out.len() + ); + } + + #[test] + fn structured_data_keeps_compact_untyped() { + // Small untyped record (e.g. a parsed pageProps with real content) — keep. + let r = make_result_with_structured(vec![serde_json::json!({ + "title": "Hi", + "body": "small enough to keep" + })]); + let out = to_llm_text(&r, None); + assert!( + out.contains("## Structured Data"), + "Compact untyped dropped: {out}" + ); + } + + #[test] + fn structured_data_keeps_compact_untyped_array() { + // SvelteKit can emit compact arrays rather than objects. + let r = make_result_with_structured(vec![serde_json::json!([ + { "title": "Hi", "body": "small array item" } + ])]); + let out = to_llm_text(&r, None); + assert!( + out.contains("small array item"), + "Compact untyped array dropped: {out}" + ); + } } diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs index d0a2c23..2699166 100644 --- a/crates/webclaw-core/src/markdown.rs +++ b/crates/webclaw-core/src/markdown.rs @@ -320,6 +320,9 @@ fn children_to_md( } } Node::Text(text) => { + if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) { + out.push(' '); + } out.push_str(text); } _ => {} @@ -350,6 +353,9 @@ fn inline_text( } } Node::Text(text) => { + if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) { + out.push(' '); + } out.push_str(text); } _ => {} @@ -361,11 +367,65 @@ fn inline_text( /// Check whether a space is needed between two adjacent chunks of output. /// Returns true when the left side doesn't end with whitespace and the right -/// side doesn't start with whitespace — i.e., two words would be mashed together. 
+/// side doesn't start with whitespace, except around punctuation that should +/// bind to the adjacent token. fn needs_separator(left: &str, right: &str) -> bool { - let l = left.as_bytes().last().copied().unwrap_or(b' '); - let r = right.as_bytes().first().copied().unwrap_or(b' '); - !l.is_ascii_whitespace() && !r.is_ascii_whitespace() + let l = left.chars().next_back().unwrap_or(' '); + let r = right.chars().next().unwrap_or(' '); + + if l.is_whitespace() || r.is_whitespace() { + return false; + } + + // Do not create "word ," / "word )" / "word 's" artifacts. + if is_closing_punctuation(r) { + return false; + } + + // Do not create "( word" / "[ 1" artifacts. + if is_opening_punctuation(l) { + return false; + } + + // Common inline-code suffixes: `Option`s, `x`'s. Treat them like a + // single token rather than separating the text node. + if matches!(l, '`' | ')') && starts_with_inline_code_suffix(right) { + return false; + } + + true +} + +fn starts_with_inline_code_suffix(s: &str) -> bool { + let trimmed = s.trim_start_matches(['*', '_']); + let mut chars = trimmed.chars(); + let Some(first) = chars.next() else { + return false; + }; + + if matches!(first, '\'' | '’') { + return true; + } + + if !matches!(first, 's' | 'S') { + return false; + } + + match chars.next() { + None => true, + Some(c) => c.is_whitespace() || is_closing_punctuation(c) || matches!(c, '*' | '_'), + } +} + +fn is_closing_punctuation(c: char) -> bool { + matches!( + c, + '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '}' | '%' | '\'' | '’' | '"' | '”' + ) +} + +fn is_opening_punctuation(c: char) -> bool { + matches!(c, '(' | '[' | '{' | '"' | '“') } /// Collect raw text content (no markdown formatting). @@ -1606,4 +1666,39 @@ mod tests { "collapse_whitespace stripped 6-space indent: {output}" ); } + + #[test] + fn text_after_inline_element_keeps_separator() { + // Reuters-style markup: agoTanker crosses... + // The "ago" text node sits between two element children. 
Without a + // separator check on the Text branch, "ago" + "Tanker" would smash + // together as "agoTanker". + let html = r#"
3hagoTanker crosses Strait
"#; + let (md, _, _) = convert_html(html, None); + assert!( + !md.contains("agoTanker"), + "Element->Text->Element smashed together: {md}" + ); + } + + #[test] + fn punctuation_after_inline_element_stays_attached() { + let html = r#"

Hello, world. Use package.json.

"#; + let (md, _, _) = convert_html(html, None); + assert!(md.contains("Hello, world"), "punctuation detached: {md}"); + assert!( + md.contains("`package.json`."), + "code punctuation detached: {md}" + ); + } + + #[test] + fn inline_code_suffix_stays_attached() { + let html = r#"

NullPointerExceptions are common.

"#; + let (md, _, _) = convert_html(html, None); + assert!( + md.contains("[`NullPointerException`](https://example.com)*s* are common"), + "code suffix detached: {md}" + ); + } } From af96628dc9c3ca3ba7f428967c49f0f668eda8e8 Mon Sep 17 00:00:00 2001 From: Valerio <88933932+0xMassi@users.noreply.github.com> Date: Sun, 10 May 2026 22:44:57 +0200 Subject: [PATCH 10/55] Revise README for clarity and updated content Updated the README to reflect changes in the project description, banner image size, and various content sections. Enhanced clarity on features and usage. --- README.md | 584 +++++++++++++++++++++++++----------------------------- 1 file changed, 275 insertions(+), 309 deletions(-) diff --git a/README.md b/README.md index 7d936c6..a663511 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@

- webclaw + webclaw

webclaw

- The fastest web scraper for AI agents.
- 67% fewer tokens. Sub-millisecond extraction. Zero browser overhead. + Turn websites into clean markdown, JSON, and LLM-ready context.
+ CLI, MCP server, REST API, and SDKs for AI agents and RAG pipelines.

@@ -17,64 +17,58 @@ License npm installs

+

Discord X / Twitter - Website + Hosted webclaw Docs

---- -

- Claude Code: web_fetch gets 403, webclaw extracts successfully -
- Claude Code's built-in web_fetch → 403 Forbidden. webclaw → clean markdown. + webclaw extracting clean markdown from a page

--- -Your AI agent calls `fetch()` and gets a 403. Or 142KB of raw HTML that burns through your token budget. **webclaw fixes both.** +Most web scraping tools give your agent one of two bad outputs: -It extracts clean, structured content from any URL using Chrome-level TLS fingerprinting — no headless browser, no Selenium, no Puppeteer. Output is optimized for LLMs: **67% fewer tokens** than raw HTML, with metadata, links, and images preserved. +- a blocked page, login wall, or empty app shell +- raw HTML full of nav, scripts, styling, ads, and duplicated boilerplate +[webclaw.io](https://webclaw.io) is the hosted web extraction API for webclaw. This repo contains the open-source CLI, MCP server, extraction engine, and self-hostable server. + +webclaw turns a URL into clean content your tools can actually use. + +```bash +webclaw https://example.com --format markdown ``` - Raw HTML webclaw -┌──────────────────────────────────┐ ┌──────────────────────────────────┐ -│
│ │ # Breaking: AI Breakthrough │ -│