diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 0000000..650984e
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,13 @@
+github: [0xMassi]
+patreon:
+open_collective:
+ko_fi:
+tidelift:
+community_bridge:
+liberapay:
+issuehunt:
+lfx_crowdfunding:
+polar:
+buy_me_a_coffee:
+thanks_dev:
+custom:
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0b14bcc..bf03cee 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,7 +14,7 @@ jobs:
name: Test
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
- uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
- run: cargo test --workspace
@@ -23,7 +23,7 @@ jobs:
name: Lint
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
- uses: dtolnay/rust-toolchain@stable
with:
components: clippy, rustfmt
@@ -31,11 +31,26 @@ jobs:
- run: cargo fmt --check --all
- run: cargo clippy --all -- -D warnings
+ wasm:
+ name: WASM
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v5
+ - uses: dtolnay/rust-toolchain@stable
+ with:
+ targets: wasm32-unknown-unknown
+ - uses: Swatinem/rust-cache@v2
+ # webclaw-core must stay WASM-safe (zero network deps, no threads).
+ # Check both with and without default features so the quickjs gate
+ # can't regress.
+ - run: cargo check --target wasm32-unknown-unknown -p webclaw-core
+ - run: cargo check --target wasm32-unknown-unknown -p webclaw-core --no-default-features
+
docs:
name: Docs
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
- uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
- run: cargo doc --no-deps --workspace
diff --git a/.github/workflows/deps.yml b/.github/workflows/deps.yml
index 29e851b..7d455cc 100644
--- a/.github/workflows/deps.yml
+++ b/.github/workflows/deps.yml
@@ -14,7 +14,7 @@ jobs:
name: Update webclaw-tls dependencies
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
with:
token: ${{ secrets.SYNC_PAT }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 4c4c241..7ad94a3 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -5,14 +5,15 @@ on:
tags: ["v*"]
permissions:
- contents: write
- packages: write
+ contents: read
env:
CARGO_TERM_COLOR: always
jobs:
build:
+ permissions:
+ contents: read
name: Build ${{ matrix.target }}
runs-on: ${{ matrix.os }}
strategy:
@@ -27,9 +28,11 @@ jobs:
os: ubuntu-latest
- target: aarch64-unknown-linux-gnu
os: ubuntu-latest
+ - target: x86_64-pc-windows-msvc
+ os: windows-latest
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
- uses: dtolnay/rust-toolchain@stable
with:
@@ -57,6 +60,12 @@ jobs:
if: matrix.target != 'aarch64-unknown-linux-gnu' && runner.os == 'Linux'
run: sudo apt-get update && sudo apt-get install -y cmake
+ - name: Install NASM (Windows)
+ if: runner.os == 'Windows'
+ run: |
+ choco install nasm -y
+ echo "C:\Program Files\NASM" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+
- name: Build
run: cargo build --release --target ${{ matrix.target }}
@@ -71,15 +80,25 @@ jobs:
# don't repeat that mistake. If a future binary gets renamed or
# removed, this step should scream, not quietly publish an
# incomplete release.
- cp target/${{ matrix.target }}/release/webclaw "$staging/"
- cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/"
- cp target/${{ matrix.target }}/release/webclaw-server "$staging/"
- cp README.md LICENSE "$staging/"
- tar czf "$staging.tar.gz" "$staging"
- echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV
+
+ if [[ "${{ matrix.os }}" == "windows-latest" ]]; then
+ cp target/${{ matrix.target }}/release/webclaw.exe "$staging/"
+ cp target/${{ matrix.target }}/release/webclaw-mcp.exe "$staging/"
+ cp target/${{ matrix.target }}/release/webclaw-server.exe "$staging/"
+ cp README.md LICENSE "$staging/"
+ 7z a -tzip "$staging.zip" "$staging"
+ echo "ASSET=$staging.zip" >> $GITHUB_ENV
+ else
+ cp target/${{ matrix.target }}/release/webclaw "$staging/"
+ cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/"
+ cp target/${{ matrix.target }}/release/webclaw-server "$staging/"
+ cp README.md LICENSE "$staging/"
+ tar czf "$staging.tar.gz" "$staging"
+ echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV
+ fi
- name: Upload artifact
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v5
with:
name: ${{ matrix.target }}
path: ${{ env.ASSET }}
@@ -88,10 +107,10 @@ jobs:
name: Release
needs: build
runs-on: ubuntu-latest
+ permissions:
+ contents: write
steps:
- - uses: actions/checkout@v4
-
- - uses: actions/download-artifact@v4
+ - uses: actions/download-artifact@v5
with:
path: artifacts
@@ -99,23 +118,31 @@ jobs:
run: |
cd artifacts
find . -name '*.tar.gz' -exec mv {} . \;
- sha256sum *.tar.gz > SHA256SUMS
+ find . -name '*.zip' -exec mv {} . \;
+ sha256sum *.tar.gz *.zip > SHA256SUMS  # strict on purpose: a missing archive type must fail the release, not fall back to hashing '*' (which would include SHA256SUMS itself and the artifact subdirectories)
cat SHA256SUMS
- name: Create GitHub Release
- uses: softprops/action-gh-release@v2
- with:
- generate_release_notes: true
- files: |
- artifacts/*.tar.gz
- artifacts/SHA256SUMS
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ run: |
+ tag="${GITHUB_REF#refs/tags/}"
+ gh release create "$tag" \
+ artifacts/*.tar.gz \
+ artifacts/*.zip \
+ artifacts/SHA256SUMS \
+ --repo "$GITHUB_REPOSITORY" \
+ --generate-notes
docker:
name: Docker
needs: release
runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ packages: write
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
- uses: docker/setup-qemu-action@v3
with:
@@ -173,6 +200,8 @@ jobs:
name: Update Homebrew
needs: [release, docker]
runs-on: ubuntu-latest
+ permissions:
+ contents: read
steps:
- name: Compute all checksums and update formula
env:
@@ -181,7 +210,7 @@ jobs:
tag="${GITHUB_REF#refs/tags/}"
base="https://github.com/0xMassi/webclaw/releases/download/${tag}"
- # Download all 4 tarballs and compute SHAs
+ # Download all tarballs (Linux + macOS) and compute SHAs
for target in aarch64-apple-darwin x86_64-apple-darwin aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu; do
curl -sSL "${base}/webclaw-${tag}-${target}.tar.gz" -o "${target}.tar.gz"
done
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7cfd1e5..856cc11 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,132 @@
All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/).
+## [0.6.5] — 2026-06-04
+
+### Changed
+- Reddit threads extract reliably again. The old anonymous JSON endpoint is no longer available, so webclaw now reads old.reddit.com directly without an API key or JavaScript. You get the post plus the full nested comment tree, with authors, scores, timestamps, and reply nesting preserved. Comment text keeps its links and code blocks, hidden scores are reported as unknown rather than zero, and deleted comments stay in place so their replies aren't lost.
+
+---
+
+## [0.6.4] — 2026-05-19
+
+### Added
+- API surface discovery: a new module extracts the API endpoints embedded in a page's inline scripts and linked JavaScript bundles. It surfaces relative REST paths, absolute URLs, GraphQL operations, and WebSocket endpoints that a sitemap alone cannot reveal. A built-in noise filter drops schema.org and json-schema.org references, bare framework paths, and other non-API matches so the result stays focused on the real surface.
+
+---
+
+## [0.6.3] — 2026-05-19
+
+### Fixed
+- Hardened resource and path-safety limits across the CLI, MCP server, and self-hosted API: oversized or highly compressed responses are capped while streaming, deeply nested page data can no longer exhaust memory, output filenames stay inside the chosen directory, webhook URLs are validated like every other fetch, and multibyte search queries no longer crash slug generation.
+
+---
+
+## [0.6.2] — 2026-05-18
+
+### Fixed
+- Cleaned up `--format llm` output on noisy news and documentation pages. Comment-count links, bare page-number paragraphs, pagination leftovers such as `0 Next`, and duplicated JSON-LD article bodies are now removed before they reach the LLM context.
+- The CLI now recognizes common cookie-consent redirects and prints a clearer warning when a page returns a consent wall instead of usable content.
+- The CLI keeps noisy parser warnings from real-world malformed HTML out of stderr by default. `WEBCLAW_LOG` still lets advanced users opt into deeper parser logs.
+
+Thanks to Nenad Oric (`@devnen`) for the report and patch work in PR #43.
+
+---
+
+## [0.6.1] — 2026-05-12
+
+### Fixed
+- Hardened URL safety across the CLI, MCP server, and self-hosted API paths so local and private network targets are rejected more consistently, including after DNS resolution and redirects.
+- Added a timeout around inline JavaScript data extraction so hostile pages cannot keep the extractor busy forever.
+- Tightened Amazon and eBay URL recognition so deceptive hosts are rejected while common international marketplaces still work.
+- Avoided unnecessary decoding work on large responses during bot-challenge detection.
+- Reduced release workflow token permissions so build jobs run with narrower GitHub access.
+
+---
+
+## [0.6.0] — 2026-05-10
+
+### Fixed
+- Improved `--format llm` output quality on modern news and documentation pages. Framework hydration blobs and low-value page chrome structured-data records are now filtered out before they can flood the LLM context, while content-bearing Schema.org records are preserved. Thanks and congrats to Nenad Oric (`@devnen`) for the contribution in PR #37.
+- Fixed element-to-text spacing so adjacent inline nodes no longer smash words together, while punctuation stays attached on real pages such as docs, forums, and reference sites.
+- Removed common screen-reader-only link chrome such as "opens new tab" from LLM body text and link labels without stripping ordinary prose that happens to mention external links.
+
+---
+
+## [0.5.9] — 2026-05-06
+
+### Fixed
+- LLM providers now support `ANTHROPIC_BASE_URL` for Anthropic-compatible proxies, plus an `OPENAI_RESPONSE_FORMAT_TYPE` override for OpenAI-compatible backends such as LM Studio. Thanks to Toti (`@Toti330`) for the report.
+
+---
+
+## [0.5.8] — 2026-05-04
+
+### Added
+- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. Thanks to Suryansh Mishra (`@notrealsuryansh`) for the contribution.
+
+### Fixed
+- Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise.
+
+### Docs
+- Refreshed the README badges with a cleaner shieldcn style. Thanks to Justin Levine (`@jal-co`) for the contribution, and shout-out to his open-source [shieldcn](https://github.com/jal-co/shieldcn) project.
+
+---
+
+## [0.5.7] — 2026-04-30
+
+### Security
+- Hardened server-side URL fetching against SSRF by rejecting private/internal IP ranges and unsafe redirect targets across CLI, MCP, and the self-hosted REST server. Thanks to KairoKid / dodge1218 (vonbrubeck@gmail.com) for the responsible report.
+
+### Docs
+- README header now uses an `<h1>webclaw</h1>` instead of an `<h3>` slogan. The repo had no heading-level brand anchor before, only a banner image, so search engines indexing the README were missing the canonical brand signal. The new heading is what GitHub renders as the title of the page and what Google co-ranks with webclaw.io.
+
+---
+
+## [0.5.6] — 2026-04-23
+
+### Added
+- `FetchClient::fetch_smart(url)` applies per-site rescue logic and returns the same `FetchResult` shape as `fetch()`. Reddit URLs route to the `.json` API with an identifiable bot `User-Agent`, and Akamai-style challenge pages trigger a homepage cookie warmup plus a retry. Makes `/v1/scrape` on Reddit populate markdown again.
+
+### Fixed
+- Regression introduced in 0.5.4 where the production server's `/v1/scrape` bypassed the Reddit `.json` shortcut and Akamai cookie warmup that `fetch_and_extract` had been providing. Both helpers now live in `fetch_smart` and every caller path picks them up.
+- Panic in the markdown converter (`markdown.rs:925`) on single-pipe `|` lines. A `[1..len-1]` slice on a 1-char input triggered `begin <= end`. Guarded.
+
+---
+
+## [0.5.5] — 2026-04-23
+
+### Added
+- `webclaw --browser safari-ios` on the CLI. Pairs with `--proxy` for DataDome-fronted sites that reject desktop profiles.
+
+---
+
+## [0.5.4] — 2026-04-23
+
+### Added
+- New `BrowserProfile::SafariIos` for Safari iOS 26 fingerprinting. Pairs with a country-matched residential proxy for sites that reject non-mobile profiles.
+- `accept_language_for_url(url)` and `accept_language_for_tld(tld)` helpers. Returns a locale-appropriate `Accept-Language` based on the URL's TLD, with `en-US` as the fallback.
+
+### Changed
+- Chrome browser fingerprint refreshed for current Cloudflare bot management. Fixes 403 challenges on several e-commerce and jobs sites.
+- Bumped `wreq-util` to `3.0.0-rc.10`.
+
+---
+
+## [0.5.2] — 2026-04-22
+
+### Added
+- **`webclaw vertical <name> <url>` subcommand on the CLI.** Runs a specific vertical extractor and prints typed JSON (pretty-printed by default, `--raw` for single-line). Example: `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/` returns `{post: {title, author, points, ...}, comments: [...]}`. URL-mismatch errors surface cleanly as `"URL '...' does not match the '...' extractor"` on stderr with exit code 1.
+
+- **`webclaw extractors` subcommand on the CLI.** Lists all 28 vertical extractors with name, label, and one URL pattern sample. `--json` emits the full catalog as JSON (same shape as `GET /v1/extractors`) for tooling. Covers discovery for users who don't know which vertical to pick.
+
+- **`vertical_scrape` and `list_extractors` tools on `webclaw-mcp`.** Claude Desktop / Claude Code users can now call any of the 28 extractors by name from an MCP session. Tool count goes from 10 to 12. `list_extractors` takes no args and returns the full catalog; `vertical_scrape` takes `{name, url}` and returns the typed JSON payload. Antibot-gated verticals still auto-escalate to the webclaw cloud API when `WEBCLAW_API_KEY` is set.
+
+### Changed
+- Server-info instruction string in `webclaw-mcp` now lists all 12 tools (previously hard-coded 10). Also `webclaw --help` on the CLI now shows the three subcommands: `bench`, `extractors`, `vertical`.
+
+---
+
## [0.5.1] — 2026-04-22
### Added
diff --git a/CLAUDE.md b/CLAUDE.md
index fcd27da..b30bd84 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -38,6 +38,7 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R
- `filter.rs` — CSS selector include/exclude filtering (ExtractionOptions)
- `diff.rs` — Content change tracking engine (snapshot diffing)
- `brand.rs` — Brand identity extraction from DOM structure and CSS
+- `youtube.rs` — `ytInitialPlayerResponse` parser, structured markdown for `youtube.com/watch` URLs (title, channel, views, published, duration, description). Produces the legacy markdown shape — for transcripts and a structured `YoutubeData` block see the production server's `youtube_transcript.rs` short-circuit (yt-dlp via proxy pool).
### Fetch Modules (`webclaw-fetch`)
- `client.rs` — FetchClient with wreq BoringSSL TLS impersonation; implements the public `Fetcher` trait so callers (including server adapters) can swap in alternative implementations
@@ -79,7 +80,7 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R
- **webclaw-fetch uses wreq 6.x** (BoringSSL). No `[patch.crates-io]` forks needed; wreq handles TLS internally.
- **No special RUSTFLAGS** — `.cargo/config.toml` is currently empty of build flags. Don't add any.
- **webclaw-llm uses plain reqwest**. LLM APIs don't need TLS fingerprinting, so no wreq dep.
-- **Vertical extractors take `&dyn Fetcher`**, not `&FetchClient`. This lets the production server plug in a `TlsSidecarFetcher` that routes through the Go tls-sidecar instead of in-process wreq.
+- **Vertical extractors take `&dyn Fetcher`**, not `&FetchClient`. This lets the production server plug in a `ProductionFetcher` that adds domain_hints routing and antibot escalation on top of the same wreq client.
- **qwen3 thinking tags** (`<think>`) are stripped at both provider and consumer levels.
## Build & Test
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3358e48..b046212 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -91,18 +91,16 @@ Body is optional but encouraged for non-trivial changes.
```
webclaw (this repo)
-├── crates/
-│ ├── webclaw-core/ # Pure extraction engine (HTML → markdown/json/text)
-│ ├── webclaw-fetch/ # HTTP client + crawler + sitemap + batch
-│ ├── webclaw-llm/ # LLM provider chain (Ollama → OpenAI → Anthropic)
-│ ├── webclaw-pdf/ # PDF text extraction
-│ ├── webclaw-cli/ # CLI binary
-│ └── webclaw-mcp/ # MCP server binary
-│
-└── [patch.crates-io] # Points to webclaw-tls for TLS fingerprinting
+└── crates/
+ ├── webclaw-core/ # Pure extraction engine (HTML → markdown/json/text)
+ ├── webclaw-fetch/ # HTTP client (wreq/BoringSSL) + crawler + sitemap + batch
+ ├── webclaw-llm/ # LLM provider chain (Ollama → OpenAI → Anthropic)
+ ├── webclaw-pdf/ # PDF text extraction
+ ├── webclaw-cli/ # CLI binary
+ └── webclaw-mcp/ # MCP server binary
```
-TLS fingerprinting lives in a separate repo: [webclaw-tls](https://github.com/0xMassi/webclaw-tls). The `[patch.crates-io]` section in `Cargo.toml` overrides rustls, h2, hyper, hyper-util, and reqwest with our patched forks for browser-grade JA4 + HTTP/2 Akamai fingerprinting.
+TLS fingerprinting is handled in-process by [wreq](https://crates.io/crates/wreq) (BoringSSL), so `webclaw-fetch` impersonates real browser TLS directly. There are no `[patch.crates-io]` forks or external TLS dependencies.
## Crate Boundaries
@@ -111,7 +109,7 @@ Changes that cross crate boundaries need extra care:
| Crate | Network? | Key constraint |
|-------|----------|----------------|
| webclaw-core | No | Zero network deps, WASM-safe |
-| webclaw-fetch | Yes (webclaw-http) | Uses [webclaw-tls](https://github.com/0xMassi/webclaw-tls) for TLS fingerprinting |
+| webclaw-fetch | Yes (wreq) | Browser TLS impersonation via wreq (BoringSSL); no patched deps |
| webclaw-llm | Yes (reqwest) | Plain reqwest — LLM APIs don't need TLS fingerprinting |
| webclaw-pdf | No | Minimal, wraps pdf-extract |
| webclaw-cli | Yes | Depends on all above |
diff --git a/Cargo.lock b/Cargo.lock
index bad52e3..78e7e77 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2967,6 +2967,26 @@ dependencies = [
"pom",
]
+[[package]]
+name = "typed-builder"
+version = "0.23.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "31aa81521b70f94402501d848ccc0ecaa8f93c8eb6999eb9747e72287757ffda"
+dependencies = [
+ "typed-builder-macro",
+]
+
+[[package]]
+name = "typed-builder-macro"
+version = "0.23.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "076a02dc54dd46795c2e9c8282ed40bcfb1e22747e955de9389a1de28190fb26"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
[[package]]
name = "typed-path"
version = "0.12.3"
@@ -3199,7 +3219,7 @@ dependencies = [
[[package]]
name = "webclaw-cli"
-version = "0.5.1"
+version = "0.6.5"
dependencies = [
"clap",
"dotenvy",
@@ -3220,7 +3240,7 @@ dependencies = [
[[package]]
name = "webclaw-core"
-version = "0.5.1"
+version = "0.6.5"
dependencies = [
"ego-tree",
"once_cell",
@@ -3238,7 +3258,7 @@ dependencies = [
[[package]]
name = "webclaw-fetch"
-version = "0.5.1"
+version = "0.6.5"
dependencies = [
"async-trait",
"bytes",
@@ -3258,12 +3278,13 @@ dependencies = [
"webclaw-core",
"webclaw-pdf",
"wreq",
+ "wreq-util",
"zip 2.4.2",
]
[[package]]
name = "webclaw-llm"
-version = "0.5.1"
+version = "0.6.5"
dependencies = [
"async-trait",
"reqwest",
@@ -3276,7 +3297,7 @@ dependencies = [
[[package]]
name = "webclaw-mcp"
-version = "0.5.1"
+version = "0.6.5"
dependencies = [
"dirs",
"dotenvy",
@@ -3296,7 +3317,7 @@ dependencies = [
[[package]]
name = "webclaw-pdf"
-version = "0.5.1"
+version = "0.6.5"
dependencies = [
"pdf-extract",
"thiserror",
@@ -3305,7 +3326,7 @@ dependencies = [
[[package]]
name = "webclaw-server"
-version = "0.5.1"
+version = "0.6.5"
dependencies = [
"anyhow",
"axum",
@@ -3709,6 +3730,16 @@ dependencies = [
"zstd",
]
+[[package]]
+name = "wreq-util"
+version = "3.0.0-rc.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c6bbe24d28beb9ceb58b514bd6a613c759d3b706f768b9d2950d5d35b543c04"
+dependencies = [
+ "typed-builder",
+ "wreq",
+]
+
[[package]]
name = "writeable"
version = "0.6.2"
diff --git a/Cargo.toml b/Cargo.toml
index 92152f2..124c620 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"]
[workspace.package]
-version = "0.5.1"
+version = "0.6.5"
edition = "2024"
license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw"
@@ -21,4 +21,3 @@ tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
clap = { version = "4", features = ["derive", "env"] }
dotenvy = "0.15"
-
diff --git a/Dockerfile b/Dockerfile
index 6f84e06..fefb39b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -59,9 +59,9 @@ RUN touch crates/*/src/*.rs \
# ---------------------------------------------------------------------------
FROM ubuntu:24.04
-RUN apt-get update && apt-get install -y --no-install-recommends \
- ca-certificates \
- && rm -rf /var/lib/apt/lists/*
+# CA bundle from distroless (ships it, multi-arch, gcr.io) instead of
+# apt-installing from ports.ubuntu.com (unreachable for arm64 on CI runners).
+COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
# Copy all three binaries
COPY --from=builder /build/target/release/webclaw /usr/local/bin/webclaw
@@ -73,11 +73,9 @@ COPY --from=builder /build/target/release/webclaw-server /usr/local/bin/webclaw-
# as documentation; callers still need `-p 3000:3000` on `docker run`.
EXPOSE 3000
-# Container default: bind all interfaces so `-p 3000:3000` works. The binary
-# itself defaults to 127.0.0.1 (safe for `cargo run` on a laptop); inside
-# Docker that would make the server unreachable, so we flip it here.
-# Override with -e WEBCLAW_HOST=127.0.0.1 if you front this with another
-# process in the same container.
+# Container default: bind all interfaces so `-p 3000:3000` works. Public
+# binding requires WEBCLAW_API_KEY; the binary refuses open-auth 0.0.0.0
+# unless WEBCLAW_ALLOW_OPEN_PUBLIC=1 is set explicitly for local testing.
ENV WEBCLAW_HOST=0.0.0.0
# Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other
diff --git a/Dockerfile.ci b/Dockerfile.ci
index ccd8a33..7b62718 100644
--- a/Dockerfile.ci
+++ b/Dockerfile.ci
@@ -5,9 +5,10 @@ ARG BINARY_DIR=binaries
FROM ubuntu:24.04
-RUN apt-get update && apt-get install -y --no-install-recommends \
- ca-certificates \
- && rm -rf /var/lib/apt/lists/*
+# CA bundle copied from a reliable multi-arch image instead of apt-installing
+# from ports.ubuntu.com — Canonical's arm64 ports mirror is unreachable from
+# CI runners and breaks the multi-arch release build. No build-time network.
+COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
ARG BINARY_DIR
COPY ${BINARY_DIR}/webclaw /usr/local/bin/webclaw
diff --git a/README.md b/README.md
index b752d46..1d0a5ac 100644
--- a/README.md
+++ b/README.md
@@ -1,70 +1,74 @@
+Most web scraping tools give your agent one of two bad outputs:
----
+- a blocked page, login wall, or empty app shell
+- raw HTML full of nav, scripts, styling, ads, and duplicated boilerplate
-Your AI agent calls `fetch()` and gets a 403. Or 142KB of raw HTML that burns through your token budget. **webclaw fixes both.**
+[webclaw.io](https://webclaw.io) is the hosted web extraction API for webclaw. This repo contains the open-source CLI, MCP server, extraction engine, and self-hostable server.
-It extracts clean, structured content from any URL using Chrome-level TLS fingerprinting — no headless browser, no Selenium, no Puppeteer. Output is optimized for LLMs: **67% fewer tokens** than raw HTML, with metadata, links, and images preserved.
+webclaw turns a URL into clean content your tools can actually use.
-```
- Raw HTML webclaw
-┌──────────────────────────────────┐ ┌──────────────────────────────────┐
-│