diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..650984e --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,13 @@ +github: [0xMassi] +patreon: +open_collective: +ko_fi: +tidelift: +community_bridge: +liberapay: +issuehunt: +lfx_crowdfunding: +polar: +buy_me_a_coffee: +thanks_dev: +custom: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0b14bcc..bf03cee 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: name: Test runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - run: cargo test --workspace @@ -23,7 +23,7 @@ jobs: name: Lint runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: dtolnay/rust-toolchain@stable with: components: clippy, rustfmt @@ -31,11 +31,26 @@ jobs: - run: cargo fmt --check --all - run: cargo clippy --all -- -D warnings + wasm: + name: WASM + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + - uses: dtolnay/rust-toolchain@stable + with: + targets: wasm32-unknown-unknown + - uses: Swatinem/rust-cache@v2 + # webclaw-core must stay WASM-safe (zero network deps, no threads). + # Check both with and without default features so the quickjs gate + # can't regress. 
+ - run: cargo check --target wasm32-unknown-unknown -p webclaw-core + - run: cargo check --target wasm32-unknown-unknown -p webclaw-core --no-default-features + docs: name: Docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - run: cargo doc --no-deps --workspace diff --git a/.github/workflows/deps.yml b/.github/workflows/deps.yml index 29e851b..7d455cc 100644 --- a/.github/workflows/deps.yml +++ b/.github/workflows/deps.yml @@ -14,7 +14,7 @@ jobs: name: Update webclaw-tls dependencies runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: token: ${{ secrets.SYNC_PAT }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4c4c241..7ad94a3 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -5,14 +5,15 @@ on: tags: ["v*"] permissions: - contents: write - packages: write + contents: read env: CARGO_TERM_COLOR: always jobs: build: + permissions: + contents: read name: Build ${{ matrix.target }} runs-on: ${{ matrix.os }} strategy: @@ -27,9 +28,11 @@ jobs: os: ubuntu-latest - target: aarch64-unknown-linux-gnu os: ubuntu-latest + - target: x86_64-pc-windows-msvc + os: windows-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: dtolnay/rust-toolchain@stable with: @@ -57,6 +60,12 @@ jobs: if: matrix.target != 'aarch64-unknown-linux-gnu' && runner.os == 'Linux' run: sudo apt-get update && sudo apt-get install -y cmake + - name: Install NASM (Windows) + if: runner.os == 'Windows' + run: | + choco install nasm -y + echo "C:\Program Files\NASM" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Build run: cargo build --release --target ${{ matrix.target }} @@ -71,15 +80,25 @@ jobs: # don't repeat that mistake. 
If a future binary gets renamed or # removed, this step should scream, not quietly publish an # incomplete release. - cp target/${{ matrix.target }}/release/webclaw "$staging/" - cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/" - cp target/${{ matrix.target }}/release/webclaw-server "$staging/" - cp README.md LICENSE "$staging/" - tar czf "$staging.tar.gz" "$staging" - echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV + + if [[ "${{ matrix.os }}" == "windows-latest" ]]; then + cp target/${{ matrix.target }}/release/webclaw.exe "$staging/" + cp target/${{ matrix.target }}/release/webclaw-mcp.exe "$staging/" + cp target/${{ matrix.target }}/release/webclaw-server.exe "$staging/" + cp README.md LICENSE "$staging/" + 7z a -tzip "$staging.zip" "$staging" + echo "ASSET=$staging.zip" >> $GITHUB_ENV + else + cp target/${{ matrix.target }}/release/webclaw "$staging/" + cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/" + cp target/${{ matrix.target }}/release/webclaw-server "$staging/" + cp README.md LICENSE "$staging/" + tar czf "$staging.tar.gz" "$staging" + echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV + fi - name: Upload artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: ${{ matrix.target }} path: ${{ env.ASSET }} @@ -88,10 +107,10 @@ jobs: name: Release needs: build runs-on: ubuntu-latest + permissions: + contents: write steps: - - uses: actions/checkout@v4 - - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v5 with: path: artifacts @@ -99,23 +118,31 @@ jobs: run: | cd artifacts find . -name '*.tar.gz' -exec mv {} . \; - sha256sum *.tar.gz > SHA256SUMS + find . -name '*.zip' -exec mv {} . 
\; + sha256sum *.tar.gz *.zip > SHA256SUMS cat SHA256SUMS - name: Create GitHub Release - uses: softprops/action-gh-release@v2 - with: - generate_release_notes: true - files: | - artifacts/*.tar.gz - artifacts/SHA256SUMS + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + tag="${GITHUB_REF#refs/tags/}" + gh release create "$tag" \ + artifacts/*.tar.gz \ + artifacts/*.zip \ + artifacts/SHA256SUMS \ + --repo "$GITHUB_REPOSITORY" \ + --generate-notes docker: name: Docker needs: release runs-on: ubuntu-latest + permissions: + contents: read + packages: write steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: docker/setup-qemu-action@v3 with: @@ -173,6 +200,8 @@ jobs: name: Update Homebrew needs: [release, docker] runs-on: ubuntu-latest + permissions: + contents: read steps: - name: Compute all checksums and update formula env: @@ -181,7 +210,7 @@ jobs: tag="${GITHUB_REF#refs/tags/}" base="https://github.com/0xMassi/webclaw/releases/download/${tag}" - # Download all 4 tarballs and compute SHAs + # Download all tarballs (Linux + macOS) and compute SHAs for target in aarch64-apple-darwin x86_64-apple-darwin aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu; do curl -sSL "${base}/webclaw-${tag}-${target}.tar.gz" -o "${target}.tar.gz" done diff --git a/CHANGELOG.md b/CHANGELOG.md index 94b9ddb..856cc11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,99 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.6.5] — 2026-06-04 + +### Changed +- Reddit threads extract reliably again. The old anonymous JSON endpoint is no longer available, so webclaw now reads old.reddit.com directly without an API key or JavaScript. You get the post plus the full nested comment tree, with authors, scores, timestamps, and reply nesting preserved.
Comment text keeps its links and code blocks, hidden scores are reported as unknown rather than zero, and deleted comments stay in place so their replies aren't lost. + +--- + +## [0.6.4] — 2026-05-19 + +### Added +- API surface discovery: a new module extracts the API endpoints embedded in a page's inline scripts and linked JavaScript bundles. It surfaces relative REST paths, absolute URLs, GraphQL operations, and WebSocket endpoints that a sitemap alone cannot reveal. A built-in noise filter drops schema.org and json-schema.org references, bare framework paths, and other non-API matches so the result stays focused on the real surface. + +--- + +## [0.6.3] — 2026-05-19 + +### Fixed +- Hardened resource and path-safety limits across the CLI, MCP server, and self-hosted API: oversized or highly compressed responses are capped while streaming, deeply nested page data can no longer exhaust memory, output filenames stay inside the chosen directory, webhook URLs are validated like every other fetch, and multibyte search queries no longer crash slug generation. + +--- + +## [0.6.2] — 2026-05-18 + +### Fixed +- Cleaned up `--format llm` output on noisy news and documentation pages. Comment-count links, bare page-number paragraphs, pagination leftovers such as `0 Next`, and duplicated JSON-LD article bodies are now removed before they reach the LLM context. +- The CLI now recognizes common cookie-consent redirects and prints a clearer warning when a page returns a consent wall instead of usable content. +- The CLI keeps noisy parser warnings from real-world malformed HTML out of stderr by default. `WEBCLAW_LOG` still lets advanced users opt into deeper parser logs. + +Thanks to Nenad Oric (`@devnen`) for the report and patch work in PR #43. 
+ +--- + +## [0.6.1] — 2026-05-12 + +### Fixed +- Hardened URL safety across the CLI, MCP server, and self-hosted API paths so local and private network targets are rejected more consistently, including after DNS resolution and redirects. +- Added a timeout around inline JavaScript data extraction so hostile pages cannot keep the extractor busy forever. +- Tightened Amazon and eBay URL recognition so deceptive hosts are rejected while common international marketplaces still work. +- Avoided unnecessary decoding work on large responses during bot-challenge detection. +- Reduced release workflow token permissions so build jobs run with narrower GitHub access. + +--- + +## [0.6.0] — 2026-05-10 + +### Fixed +- Improved `--format llm` output quality on modern news and documentation pages. Framework hydration blobs and low-value page chrome structured-data records are now filtered out before they can flood the LLM context, while content-bearing Schema.org records are preserved. Thanks and congrats to Nenad Oric (`@devnen`) for the contribution in PR #37. +- Fixed element-to-text spacing so adjacent inline nodes no longer smash words together, while punctuation stays attached on real pages such as docs, forums, and reference sites. +- Removed common screen-reader-only link chrome such as "opens new tab" from LLM body text and link labels without stripping ordinary prose that happens to mention external links. + +--- + +## [0.5.9] — 2026-05-06 + +### Fixed +- LLM providers now support `ANTHROPIC_BASE_URL` for Anthropic-compatible proxies, plus an `OPENAI_RESPONSE_FORMAT_TYPE` override for OpenAI-compatible backends such as LM Studio. Thanks to Toti (`@Toti330`) for the report. + +--- + +## [0.5.8] — 2026-05-04 + +### Added +- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. Thanks to Suryansh Mishra (`@notrealsuryansh`) for the contribution. 
+ +### Fixed +- Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise. + +### Docs +- Refreshed the README badges with a cleaner shieldcn style. Thanks to Justin Levine (`@jal-co`) for the contribution, and shout-out to his open-source [shieldcn](https://github.com/jal-co/shieldcn) project. + +--- + +## [0.5.7] — 2026-04-30 + +### Security +- Hardened server-side URL fetching against SSRF by rejecting private/internal IP ranges and unsafe redirect targets across CLI, MCP, and the self-hosted REST server. Thanks to KairoKid / dodge1218 (vonbrubeck@gmail.com) for the responsible report. + +### Docs +- README header now uses an `
-
-
-
-
+ Turn websites into clean markdown, JSON, and LLM-ready context.
+ CLI, MCP server, REST API, and SDKs for AI agents and RAG pipelines.
+
-
-
- Claude Code's built-in web_fetch → 403 Forbidden. webclaw → clean markdown.
-