diff --git a/.github/banner.png b/.github/banner.png index 968277f..07a6673 100644 Binary files a/.github/banner.png and b/.github/banner.png differ diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bf03cee..0b14bcc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: name: Test runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - run: cargo test --workspace @@ -23,7 +23,7 @@ jobs: name: Lint runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: components: clippy, rustfmt @@ -31,26 +31,11 @@ jobs: - run: cargo fmt --check --all - run: cargo clippy --all -- -D warnings - wasm: - name: WASM - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v5 - - uses: dtolnay/rust-toolchain@stable - with: - targets: wasm32-unknown-unknown - - uses: Swatinem/rust-cache@v2 - # webclaw-core must stay WASM-safe (zero network deps, no threads). - # Check both with and without default features so the quickjs gate - # can't regress. - - run: cargo check --target wasm32-unknown-unknown -p webclaw-core - - run: cargo check --target wasm32-unknown-unknown -p webclaw-core --no-default-features - docs: name: Docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - run: cargo doc --no-deps --workspace diff --git a/.github/workflows/deps.yml b/.github/workflows/deps.yml index 7d455cc..29e851b 100644 --- a/.github/workflows/deps.yml +++ b/.github/workflows/deps.yml @@ -14,7 +14,7 @@ jobs: name: Update webclaw-tls dependencies runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 with: token: ${{ secrets.SYNC_PAT }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cd77d01..b2ea54a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -3,29 +3,16 @@ name: Release on: push: tags: ["v*"] - # Manual re-publish of the Docker image for an existing release, without - # rebuilding binaries or cutting a new version. Runs only the docker (+ - # homebrew) jobs against the given tag's already-published release assets. - workflow_dispatch: - inputs: - tag: - description: "Existing release tag to (re)build + push the Docker image for, e.g. v0.6.9" - required: true - type: string permissions: - contents: read + contents: write + packages: write env: CARGO_TERM_COLOR: always jobs: build: - # Binaries are only built when a tag is pushed. A manual dispatch reuses - # the existing release's binaries, so it skips this job entirely. - if: github.event_name == 'push' - permissions: - contents: read name: Build ${{ matrix.target }} runs-on: ${{ matrix.os }} strategy: @@ -44,7 +31,7 @@ jobs: os: windows-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: @@ -110,20 +97,19 @@ jobs: fi - name: Upload artifact - uses: actions/upload-artifact@v5 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.target }} path: ${{ env.ASSET }} release: name: Release - if: github.event_name == 'push' needs: build runs-on: ubuntu-latest - permissions: - contents: write steps: - - uses: actions/download-artifact@v5 + - uses: actions/checkout@v4 + + - uses: actions/download-artifact@v4 with: path: artifacts @@ -136,30 +122,20 @@ jobs: cat SHA256SUMS - name: Create GitHub Release - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - tag="${GITHUB_REF#refs/tags/}" - gh release create "$tag" \ - artifacts/*.tar.gz \ - artifacts/*.zip \ - artifacts/SHA256SUMS \ - --repo "$GITHUB_REPOSITORY" \ - --generate-notes + uses: softprops/action-gh-release@v2 + with: + generate_release_notes: true + files: | + artifacts/*.tar.gz + artifacts/*.zip + artifacts/SHA256SUMS docker: name: Docker needs: release - # Runs after a successful release on tag push, or standalone via - # workflow_dispatch to (re)publish an existing tag's image. `always()` lets - # it run even though `release` is skipped on a manual dispatch. - if: ${{ always() && (github.event_name == 'workflow_dispatch' || needs.release.result == 'success') }} runs-on: ubuntu-latest - permissions: - contents: read - packages: write steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - uses: docker/setup-qemu-action@v3 with: @@ -173,57 +149,56 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - # The pushed tag, or the workflow_dispatch input for a manual re-publish. - - name: Resolve tag - id: tag - run: echo "tag=${{ github.event.inputs.tag || github.ref_name }}" >> "$GITHUB_OUTPUT" - - # Download pre-built binaries into TARGETARCH-named dirs (amd64/arm64) so - # a single multi-platform build picks the matching binary per platform. + # Download pre-built binaries for both architectures - name: Download release binaries run: | - tag="${{ steps.tag.outputs.tag }}" - declare -A arch=( [x86_64-unknown-linux-gnu]=amd64 [aarch64-unknown-linux-gnu]=arm64 ) + tag="${GITHUB_REF#refs/tags/}" for target in x86_64-unknown-linux-gnu aarch64-unknown-linux-gnu; do dir="webclaw-${tag}-${target}" curl -sSL "https://github.com/0xMassi/webclaw/releases/download/${tag}/${dir}.tar.gz" -o "${target}.tar.gz" tar xzf "${target}.tar.gz" - a="${arch[$target]}" - mkdir -p "binaries-${a}" - cp "${dir}/webclaw" "${dir}/webclaw-mcp" "${dir}/webclaw-server" "binaries-${a}/" - chmod +x "binaries-${a}"/* + mkdir -p "binaries-${target}" + cp "${dir}/webclaw" "binaries-${target}/webclaw" + cp "${dir}/webclaw-mcp" "binaries-${target}/webclaw-mcp" + cp "${dir}/webclaw-server" "binaries-${target}/webclaw-server" + chmod +x "binaries-${target}"/* done ls -laR binaries-*/ - # One atomic multi-platform build + push. buildx assembles a single - # manifest list and pushes it in one shot, so there is no separate - # `imagetools create` step to race GHCR's read-after-write (that is what - # failed before: "v0.6.9-arm64: not found"). Provenance/SBOM attestations - # are disabled so each platform entry stays a plain image manifest. + # Build per-arch images with plain docker build (no buildx manifest nesting) - name: Build and push run: | - tag="${{ steps.tag.outputs.tag }}" - docker buildx build -f Dockerfile.ci \ - --platform linux/amd64,linux/arm64 \ - --provenance=false --sbom=false \ - -t "ghcr.io/0xmassi/webclaw:${tag}" \ - -t ghcr.io/0xmassi/webclaw:latest \ - --push . + tag="${GITHUB_REF#refs/tags/}" + + # amd64 + docker build -f Dockerfile.ci --build-arg BINARY_DIR=binaries-x86_64-unknown-linux-gnu \ + --platform linux/amd64 -t ghcr.io/0xmassi/webclaw:${tag}-amd64 --push . + + # arm64 + docker build -f Dockerfile.ci --build-arg BINARY_DIR=binaries-aarch64-unknown-linux-gnu \ + --platform linux/arm64 -t ghcr.io/0xmassi/webclaw:${tag}-arm64 --push . + + # Multi-arch manifest + docker manifest create ghcr.io/0xmassi/webclaw:${tag} \ + ghcr.io/0xmassi/webclaw:${tag}-amd64 \ + ghcr.io/0xmassi/webclaw:${tag}-arm64 + docker manifest push ghcr.io/0xmassi/webclaw:${tag} + + docker manifest create ghcr.io/0xmassi/webclaw:latest \ + ghcr.io/0xmassi/webclaw:${tag}-amd64 \ + ghcr.io/0xmassi/webclaw:${tag}-arm64 + docker manifest push ghcr.io/0xmassi/webclaw:latest homebrew: name: Update Homebrew needs: [release, docker] - # Runs once Docker succeeds, on both tag push and manual re-publish. - if: ${{ always() && needs.docker.result == 'success' }} runs-on: ubuntu-latest - permissions: - contents: read steps: - name: Compute all checksums and update formula env: COMMITTER_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }} run: | - tag="${{ github.event.inputs.tag || github.ref_name }}" + tag="${GITHUB_REF#refs/tags/}" base="https://github.com/0xMassi/webclaw/releases/download/${tag}" # Download all tarballs (Linux + macOS) and compute SHAs diff --git a/CHANGELOG.md b/CHANGELOG.md index cc21d32..7858ae4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,92 +3,6 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). -## [Unreleased] - -## [0.6.9] - 2026-06-10 - -### Fixed -- The multi-arch Docker image (linux/amd64 + linux/arm64) now publishes reliably on each release. The build moved to Buildx so registry pushes no longer fail intermittently, and the Homebrew formula update that depends on it is no longer skipped. - -## [0.6.8] - 2026-06-10 - -### Fixed -- Pages with multibyte text (accented or CJK characters) no longer panic or get mangled during extraction. API-endpoint discovery now cuts oversized scripts on a character boundary instead of crashing mid-character, and structured-data parsing preserves non-ASCII string values instead of turning them into mojibake. -- LLM error messages from a provider no longer panic when the error body contains multibyte characters near the truncation point. -- LLM provider requests now have explicit connect and overall timeouts, so a stalled or unreachable provider fails fast instead of hanging. -- Batch extraction in the MCP server no longer aborts the whole batch when a single URL fails to resolve; bad URLs are reported as individual per-URL errors and the rest still run. -- CLI crawl and batch runs now wait for the completion webhook to actually send before exiting, replacing a fixed delay that could cut the request off or waste time. -- Homepage warm-up requests now include the port for hosts on a non-default port, so those sites are warmed correctly. - ---- - -## [0.6.7] — 2026-06-09 - -### Changed -- Updated the HTTP/TLS engine (wreq 6.0.0-rc.29, wreq-util 3.0.0-rc.12). This pulls in upstream robustness fixes: no more panic on responses with non-UTF8 header values, a fix for short reads when decoding large compressed bodies, and the TCP nodelay setting is restored. Browser TLS fingerprints are unchanged. - ---- - -## [0.6.6] — 2026-06-09 - -### Added -- Slow fetches now print a progress line to stderr every 10 seconds (`# webclaw: still fetching (Ns)`) so a long request no longer looks like the CLI hung. Fast fetches stay silent and stdout is untouched. -- New `--url-encoded` flag plus a warning when a URL looks like the shell split it on `&` or `?`. The warning suggests quoting the URL; pass `--url-encoded` to silence it when the URL is intentional. - ---- - -## [0.6.5] — 2026-06-04 - -### Changed -- Reddit threads extract reliably again. The old anonymous JSON endpoint is no longer available, so webclaw now reads old.reddit.com directly without an API key or JavaScript. You get the post plus the full nested comment tree, with authors, scores, timestamps, and reply nesting preserved. Comment text keeps its links and code blocks, hidden scores are reported as unknown rather than zero, and deleted comments stay in place so their replies aren't lost. - ---- - -## [0.6.4] — 2026-05-19 - -### Added -- API surface discovery: a new module extracts the API endpoints embedded in a page's inline scripts and linked JavaScript bundles. It surfaces relative REST paths, absolute URLs, GraphQL operations, and WebSocket endpoints that a sitemap alone cannot reveal. A built-in noise filter drops schema.org and json-schema.org references, bare framework paths, and other non-API matches so the result stays focused on the real surface. - ---- - -## [0.6.3] — 2026-05-19 - -### Fixed -- Hardened resource and path-safety limits across the CLI, MCP server, and self-hosted API: oversized or highly compressed responses are capped while streaming, deeply nested page data can no longer exhaust memory, output filenames stay inside the chosen directory, webhook URLs are validated like every other fetch, and multibyte search queries no longer crash slug generation. - ---- - -## [0.6.2] — 2026-05-18 - -### Fixed -- Cleaned up `--format llm` output on noisy news and documentation pages. Comment-count links, bare page-number paragraphs, pagination leftovers such as `0 Next`, and duplicated JSON-LD article bodies are now removed before they reach the LLM context. -- The CLI now recognizes common cookie-consent redirects and prints a clearer warning when a page returns a consent wall instead of usable content. -- The CLI keeps noisy parser warnings from real-world malformed HTML out of stderr by default. `WEBCLAW_LOG` still lets advanced users opt into deeper parser logs. - -Thanks to Nenad Oric (`@devnen`) for the report and patch work in PR #43. - ---- - -## [0.6.1] — 2026-05-12 - -### Fixed -- Hardened URL safety across the CLI, MCP server, and self-hosted API paths so local and private network targets are rejected more consistently, including after DNS resolution and redirects. -- Added a timeout around inline JavaScript data extraction so hostile pages cannot keep the extractor busy forever. -- Tightened Amazon and eBay URL recognition so deceptive hosts are rejected while common international marketplaces still work. -- Avoided unnecessary decoding work on large responses during bot-challenge detection. -- Reduced release workflow token permissions so build jobs run with narrower GitHub access. - ---- - -## [0.6.0] — 2026-05-10 - -### Fixed -- Improved `--format llm` output quality on modern news and documentation pages. Framework hydration blobs and low-value page chrome structured-data records are now filtered out before they can flood the LLM context, while content-bearing Schema.org records are preserved. Thanks and congrats to Nenad Oric (`@devnen`) for the contribution in PR #37. -- Fixed element-to-text spacing so adjacent inline nodes no longer smash words together, while punctuation stays attached on real pages such as docs, forums, and reference sites. -- Removed common screen-reader-only link chrome such as "opens new tab" from LLM body text and link labels without stripping ordinary prose that happens to mention external links. - ---- - ## [0.5.9] — 2026-05-06 ### Fixed diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b046212..3358e48 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -91,16 +91,18 @@ Body is optional but encouraged for non-trivial changes. ``` webclaw (this repo) -└── crates/ - ├── webclaw-core/ # Pure extraction engine (HTML → markdown/json/text) - ├── webclaw-fetch/ # HTTP client (wreq/BoringSSL) + crawler + sitemap + batch - ├── webclaw-llm/ # LLM provider chain (Ollama → OpenAI → Anthropic) - ├── webclaw-pdf/ # PDF text extraction - ├── webclaw-cli/ # CLI binary - └── webclaw-mcp/ # MCP server binary +├── crates/ +│ ├── webclaw-core/ # Pure extraction engine (HTML → markdown/json/text) +│ ├── webclaw-fetch/ # HTTP client + crawler + sitemap + batch +│ ├── webclaw-llm/ # LLM provider chain (Ollama → OpenAI → Anthropic) +│ ├── webclaw-pdf/ # PDF text extraction +│ ├── webclaw-cli/ # CLI binary +│ └── webclaw-mcp/ # MCP server binary +│ +└── [patch.crates-io] # Points to webclaw-tls for TLS fingerprinting ``` -TLS fingerprinting is handled in-process by [wreq](https://crates.io/crates/wreq) (BoringSSL), so `webclaw-fetch` impersonates real browser TLS directly. There are no `[patch.crates-io]` forks or external TLS dependencies. +TLS fingerprinting lives in a separate repo: [webclaw-tls](https://github.com/0xMassi/webclaw-tls). The `[patch.crates-io]` section in `Cargo.toml` overrides rustls, h2, hyper, hyper-util, and reqwest with our patched forks for browser-grade JA4 + HTTP/2 Akamai fingerprinting. ## Crate Boundaries @@ -109,7 +111,7 @@ Changes that cross crate boundaries need extra care: | Crate | Network? | Key constraint | |-------|----------|----------------| | webclaw-core | No | Zero network deps, WASM-safe | -| webclaw-fetch | Yes (wreq) | Browser TLS impersonation via wreq (BoringSSL); no patched deps | +| webclaw-fetch | Yes (webclaw-http) | Uses [webclaw-tls](https://github.com/0xMassi/webclaw-tls) for TLS fingerprinting | | webclaw-llm | Yes (reqwest) | Plain reqwest — LLM APIs don't need TLS fingerprinting | | webclaw-pdf | No | Minimal, wraps pdf-extract | | webclaw-cli | Yes | Depends on all above | diff --git a/Cargo.lock b/Cargo.lock index b8a1d23..e49ccc3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -28,6 +28,18 @@ dependencies = [ "cpufeatures", ] +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -52,12 +64,6 @@ dependencies = [ "alloc-no-stdlib", ] -[[package]] -name = "allocator-api2" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -266,9 +272,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.13.0" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" [[package]] name = "block-buffer" @@ -279,6 +285,31 @@ dependencies = [ "generic-array", ] +[[package]] +name = "boring-sys2" +version = "5.0.0-alpha.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "455d79965f5155dcc88a7abce112c3590883889131b799beda10bf9a813ed669" +dependencies = [ + "bindgen", + "cmake", + "fs_extra", + "fslock", +] + +[[package]] +name = "boring2" +version = "5.0.0-alpha.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183ccc3854411c035410dcdbffafca62084f3a6c33f013c77e83c025d2a08a28" +dependencies = [ + "bitflags", + "boring-sys2", + "foreign-types", + "libc", + "openssl-macros", +] + [[package]] name = "brotli" version = "8.0.2" @@ -300,31 +331,6 @@ dependencies = [ "alloc-stdlib", ] -[[package]] -name = "btls" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c5e60b8c8d282c86360cab651ded04ab0335a7b5390c8d34145cbeab8cacf5f" -dependencies = [ - "bitflags", - "btls-sys", - "foreign-types", - "libc", - "openssl-macros", -] - -[[package]] -name = "btls-sys" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b1b8638a2e1c38a5ae4efa90ae57e643baec35a30d03fc5b399b893adc4954b" -dependencies = [ - "bindgen", - "cmake", - "fs_extra", - "fslock", -] - [[package]] name = "bumpalo" version = "3.20.2" @@ -859,12 +865,6 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" -[[package]] -name = "foldhash" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" - [[package]] name = "foreign-types" version = "0.5.0" @@ -1089,13 +1089,19 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "hashbrown" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" + [[package]] name = "hashbrown" version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "foldhash 0.1.5", + "foldhash", ] [[package]] @@ -1104,17 +1110,6 @@ version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" -[[package]] -name = "hashbrown" -version = "0.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" -dependencies = [ - "allocator-api2", - "equivalent", - "foldhash 0.2.0", -] - [[package]] name = "heck" version = "0.5.0" @@ -1177,9 +1172,9 @@ dependencies = [ [[package]] name = "http2" -version = "0.5.17" +version = "0.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "569ef7a780e853c4e1768f58a3c8168193b82cdcbab66638a0b1c6583ec5995e" +checksum = "c45c6490693ee8a8d0d95fdbdf76fead9fb87548f7894137259a7c6d22821948" dependencies = [ "atomic-waker", "bytes", @@ -1188,6 +1183,7 @@ dependencies = [ "futures-sink", "http", "indexmap", + "parking_lot", "slab", "smallvec", "tokio", @@ -1499,9 +1495,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "libc" -version = "0.2.186" +version = "0.2.183" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" [[package]] name = "libloading" @@ -1567,15 +1563,6 @@ dependencies = [ "weezl", ] -[[package]] -name = "lru" -version = "0.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a860605968fce16869fd239cf4237a82f3ac470723415db603b0e8b6c8d4fb9" -dependencies = [ - "hashbrown 0.17.1", -] - [[package]] name = "lru-slab" version = "0.1.2" @@ -2388,6 +2375,17 @@ dependencies = [ "syn", ] +[[package]] +name = "schnellru" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "356285bbf17bea63d9e52e96bd18f039672ac92b55b8cb997d6162a2a37d1649" +dependencies = [ + "ahash", + "cfg-if", + "hashbrown 0.13.2", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -2781,9 +2779,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.52.3" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" +checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" dependencies = [ "bytes", "libc", @@ -2797,20 +2795,20 @@ dependencies = [ ] [[package]] -name = "tokio-btls" -version = "0.5.6" +name = "tokio-boring2" +version = "5.0.0-alpha.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e1fd638ec35427faf3b8f412e0fdd6fae76591d79dba40f38fa667d22bc44dd" +checksum = "0f81df1210d791f31d72d840de8fbd80b9c3cb324956523048b1413e2bd55756" dependencies = [ - "btls", + "boring2", "tokio", ] [[package]] name = "tokio-macros" -version = "2.7.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" +checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" dependencies = [ "proc-macro2", "quote", @@ -3221,7 +3219,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.6.9" +version = "0.5.9" dependencies = [ "clap", "dotenvy", @@ -3242,7 +3240,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.6.9" +version = "0.5.9" dependencies = [ "ego-tree", "once_cell", @@ -3260,12 +3258,11 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.6.9" +version = "0.5.9" dependencies = [ "async-trait", "bytes", "calamine", - "futures-util", "http", "quick-xml 0.37.5", "rand 0.8.5", @@ -3287,7 +3284,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.6.9" +version = "0.5.9" dependencies = [ "async-trait", "reqwest", @@ -3300,7 +3297,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.6.9" +version = "0.5.9" dependencies = [ "dirs", "dotenvy", @@ -3320,7 +3317,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.6.9" +version = "0.5.9" dependencies = [ "pdf-extract", "thiserror", @@ -3329,7 +3326,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.6.9" +version = "0.5.9" dependencies = [ "anyhow", "axum", @@ -3350,9 +3347,9 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.7" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" +checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" dependencies = [ "rustls-pki-types", ] @@ -3699,14 +3696,17 @@ dependencies = [ [[package]] name = "wreq" -version = "6.0.0-rc.29" +version = "6.0.0-rc.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f0eba5f5814a94e5f1a99156f187133464e525b66bdbc69a9627d46530af2e1" +checksum = "f79937f6c4df65b3f6f78715b9de2977afe9ee3b3436483c7949a24511e25935" dependencies = [ - "btls", - "btls-sys", + "ahash", + "boring2", + "brotli", "bytes", "cookie", + "flate2", + "futures-channel", "futures-util", "http", "http-body", @@ -3715,64 +3715,29 @@ dependencies = [ "httparse", "ipnet", "libc", - "lru", "percent-encoding", "pin-project-lite", + "schnellru", + "smallvec", "socket2", - "sync_wrapper", "tokio", - "tokio-btls", - "tokio-util", + "tokio-boring2", "tower", "tower-http", "url", - "webpki-root-certs", - "wreq-proto", - "wreq-rt", -] - -[[package]] -name = "wreq-proto" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a43942f024bb303f1042c9aa3c87fa1d9149f507c65db6e5220a11ccdb207387" -dependencies = [ - "bytes", - "futures-channel", - "futures-util", - "http", - "http-body", - "http2", - "httparse", - "pin-project-lite", - "smallvec", - "tokio", - "tokio-util", "want", -] - -[[package]] -name = "wreq-rt" -version = "0.2.2-rc.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99e9bce67a3fa3dd3f1503f066d86661c9caf399a763d3bd184da7afaf886c8b" -dependencies = [ - "pin-project-lite", - "tokio", - "wreq-proto", + "webpki-root-certs", + "zstd", ] [[package]] name = "wreq-util" -version = "3.0.0-rc.12" +version = "3.0.0-rc.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baa5d2ab72139256916ca352a3d05c53d74e1dd360052eb5ba7691033c417c65" +checksum = "6c6bbe24d28beb9ceb58b514bd6a613c759d3b706f768b9d2950d5d35b543c04" dependencies = [ - "brotli", - "flate2", "typed-builder", "wreq", - "zstd", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index fc3a2c5..12a4b73 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.6.9" +version = "0.5.9" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/Dockerfile b/Dockerfile index fefb39b..552aea7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -59,9 +59,9 @@ RUN touch crates/*/src/*.rs \ # --------------------------------------------------------------------------- FROM ubuntu:24.04 -# CA bundle from distroless (ships it, multi-arch, gcr.io) instead of -# apt-installing from ports.ubuntu.com (unreachable for arm64 on CI runners). -COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* # Copy all three binaries COPY --from=builder /build/target/release/webclaw /usr/local/bin/webclaw diff --git a/Dockerfile.ci b/Dockerfile.ci index 740855d..ccd8a33 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -1,21 +1,18 @@ # Slim runtime image — uses pre-built binaries from the release. # The full Dockerfile (multi-stage Rust build) is for local development. # CI uses this to avoid 60+ min QEMU cross-compilation. +ARG BINARY_DIR=binaries FROM ubuntu:24.04 -# CA bundle copied from a reliable multi-arch image instead of apt-installing -# from ports.ubuntu.com — Canonical's arm64 ports mirror is unreachable from -# CI runners and breaks the multi-arch release build. No build-time network. -COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* -# TARGETARCH (amd64 / arm64) is provided automatically by buildx for each -# target platform, so one multi-platform build copies the matching binaries. -# The release workflow stages them in binaries-amd64 / binaries-arm64. -ARG TARGETARCH -COPY binaries-${TARGETARCH}/webclaw /usr/local/bin/webclaw -COPY binaries-${TARGETARCH}/webclaw-mcp /usr/local/bin/webclaw-mcp -COPY binaries-${TARGETARCH}/webclaw-server /usr/local/bin/webclaw-server +ARG BINARY_DIR +COPY ${BINARY_DIR}/webclaw /usr/local/bin/webclaw +COPY ${BINARY_DIR}/webclaw-mcp /usr/local/bin/webclaw-mcp +COPY ${BINARY_DIR}/webclaw-server /usr/local/bin/webclaw-server # Default REST API port when running `webclaw-server` inside the container. EXPOSE 3000 @@ -27,9 +24,8 @@ ENV WEBCLAW_HOST=0.0.0.0 # Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other # commands directly so this image can be used as a FROM base with custom CMD. -# `--chmod` sets the bit at copy time so the build needs no in-container `RUN` -# (and thus no QEMU emulation for the arm64 platform). -COPY --chmod=755 docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh +COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh +RUN chmod +x /usr/local/bin/docker-entrypoint.sh ENTRYPOINT ["docker-entrypoint.sh"] CMD ["webclaw", "--help"] diff --git a/README.md b/README.md index d44e659..79758f0 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@

- webclaw + webclaw

webclaw

- Turn websites into clean markdown, JSON, and LLM-ready context.
- CLI, MCP server, REST API, and SDKs for AI agents and RAG pipelines. + The fastest web scraper for AI agents.
+ 67% fewer tokens. Sub-millisecond extraction. Zero browser overhead.

@@ -17,58 +17,64 @@ License npm installs

-

Discord X / Twitter - Hosted webclaw + Website Docs

+--- +

- webclaw extracting clean markdown from a page + Claude Code: web_fetch gets 403, webclaw extracts successfully +
+ Claude Code's built-in web_fetch → 403 Forbidden. webclaw → clean markdown.

--- -Most web scraping tools give your agent one of two bad outputs: +Your AI agent calls `fetch()` and gets a 403. Or 142KB of raw HTML that burns through your token budget. **webclaw fixes both.** -- a blocked page, login wall, or empty app shell -- raw HTML full of nav, scripts, styling, ads, and duplicated boilerplate +It extracts clean, structured content from any URL using Chrome-level TLS fingerprinting — no headless browser, no Selenium, no Puppeteer. Output is optimized for LLMs: **67% fewer tokens** than raw HTML, with metadata, links, and images preserved. -[webclaw.io](https://webclaw.io) is the hosted web extraction API for webclaw. This repo contains the open-source CLI, MCP server, extraction engine, and self-hostable server. - -webclaw turns a URL into clean content your tools can actually use. - -```bash -webclaw https://example.com --format markdown ``` - -```md -# Example Domain - -This domain is for use in illustrative examples in documents. - -You may use this domain in literature without prior coordination or asking for permission. + Raw HTML webclaw +┌──────────────────────────────────┐ ┌──────────────────────────────────┐ +│
│ │ # Breaking: AI Breakthrough │ +│