diff --git a/.github/banner.png b/.github/banner.png index 07a6673..968277f 100644 Binary files a/.github/banner.png and b/.github/banner.png differ diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7ad94a3..cd77d01 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -3,6 +3,15 @@ name: Release on: push: tags: ["v*"] + # Manual re-publish of the Docker image for an existing release, without + # rebuilding binaries or cutting a new version. Runs only the docker (+ + # homebrew) jobs against the given tag's already-published release assets. + workflow_dispatch: + inputs: + tag: + description: "Existing release tag to (re)build + push the Docker image for, e.g. v0.6.9" + required: true + type: string permissions: contents: read @@ -12,6 +21,9 @@ env: jobs: build: + # Binaries are only built when a tag is pushed. A manual dispatch reuses + # the existing release's binaries, so it skips this job entirely. + if: github.event_name == 'push' permissions: contents: read name: Build ${{ matrix.target }} @@ -105,6 +117,7 @@ jobs: release: name: Release + if: github.event_name == 'push' needs: build runs-on: ubuntu-latest permissions: @@ -137,6 +150,10 @@ jobs: docker: name: Docker needs: release + # Runs after a successful release on tag push, or standalone via + # workflow_dispatch to (re)publish an existing tag's image. `always()` lets + # it run even though `release` is skipped on a manual dispatch. + if: ${{ always() && (github.event_name == 'workflow_dispatch' || needs.release.result == 'success') }} runs-on: ubuntu-latest permissions: contents: read @@ -156,49 +173,48 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - # Download pre-built binaries for both architectures + # The pushed tag, or the workflow_dispatch input for a manual re-publish. 
+ - name: Resolve tag + id: tag + run: echo "tag=${{ github.event.inputs.tag || github.ref_name }}" >> "$GITHUB_OUTPUT" + + # Download pre-built binaries into TARGETARCH-named dirs (amd64/arm64) so + # a single multi-platform build picks the matching binary per platform. - name: Download release binaries run: | - tag="${GITHUB_REF#refs/tags/}" + tag="${{ steps.tag.outputs.tag }}" + declare -A arch=( [x86_64-unknown-linux-gnu]=amd64 [aarch64-unknown-linux-gnu]=arm64 ) for target in x86_64-unknown-linux-gnu aarch64-unknown-linux-gnu; do dir="webclaw-${tag}-${target}" curl -sSL "https://github.com/0xMassi/webclaw/releases/download/${tag}/${dir}.tar.gz" -o "${target}.tar.gz" tar xzf "${target}.tar.gz" - mkdir -p "binaries-${target}" - cp "${dir}/webclaw" "binaries-${target}/webclaw" - cp "${dir}/webclaw-mcp" "binaries-${target}/webclaw-mcp" - cp "${dir}/webclaw-server" "binaries-${target}/webclaw-server" - chmod +x "binaries-${target}"/* + a="${arch[$target]}" + mkdir -p "binaries-${a}" + cp "${dir}/webclaw" "${dir}/webclaw-mcp" "${dir}/webclaw-server" "binaries-${a}/" + chmod +x "binaries-${a}"/* done ls -laR binaries-*/ - # Build per-arch images with plain docker build (no buildx manifest nesting) + # One atomic multi-platform build + push. buildx assembles a single + # manifest list and pushes it in one shot, so there is no separate + # `imagetools create` step to race GHCR's read-after-write (that is what + # failed before: "v0.6.9-arm64: not found"). Provenance/SBOM attestations + # are disabled so each platform entry stays a plain image manifest. - name: Build and push run: | - tag="${GITHUB_REF#refs/tags/}" - - # amd64 - docker build -f Dockerfile.ci --build-arg BINARY_DIR=binaries-x86_64-unknown-linux-gnu \ - --platform linux/amd64 -t ghcr.io/0xmassi/webclaw:${tag}-amd64 --push . - - # arm64 - docker build -f Dockerfile.ci --build-arg BINARY_DIR=binaries-aarch64-unknown-linux-gnu \ - --platform linux/arm64 -t ghcr.io/0xmassi/webclaw:${tag}-arm64 --push . 
- - # Multi-arch manifest - docker manifest create ghcr.io/0xmassi/webclaw:${tag} \ - ghcr.io/0xmassi/webclaw:${tag}-amd64 \ - ghcr.io/0xmassi/webclaw:${tag}-arm64 - docker manifest push ghcr.io/0xmassi/webclaw:${tag} - - docker manifest create ghcr.io/0xmassi/webclaw:latest \ - ghcr.io/0xmassi/webclaw:${tag}-amd64 \ - ghcr.io/0xmassi/webclaw:${tag}-arm64 - docker manifest push ghcr.io/0xmassi/webclaw:latest + tag="${{ steps.tag.outputs.tag }}" + docker buildx build -f Dockerfile.ci \ + --platform linux/amd64,linux/arm64 \ + --provenance=false --sbom=false \ + -t "ghcr.io/0xmassi/webclaw:${tag}" \ + -t ghcr.io/0xmassi/webclaw:latest \ + --push . homebrew: name: Update Homebrew needs: [release, docker] + # Runs once Docker succeeds, on both tag push and manual re-publish. + if: ${{ always() && needs.docker.result == 'success' }} runs-on: ubuntu-latest permissions: contents: read @@ -207,7 +223,10 @@ jobs: env: COMMITTER_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }} + # The tag is handed to the script as an environment variable rather than + # expanded into the script text, so its value is never parsed as shell. + RELEASE_TAG: ${{ github.event.inputs.tag || github.ref_name }} run: | - tag="${GITHUB_REF#refs/tags/}" + tag="${RELEASE_TAG}" base="https://github.com/0xMassi/webclaw/releases/download/${tag}" # Download all tarballs (Linux + macOS) and compute SHAs diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 0000000..fabcd4f --- /dev/null +++ b/.mcp.json @@ -0,0 +1,7 @@ +{ + "mcpServers": { + "webclaw": { + "command": "${HOME}/.webclaw/webclaw-mcp" + } + } +} diff --git a/CHANGELOG.md b/CHANGELOG.md index fd27092..f2bda69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,59 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [Unreleased] + +## [0.6.13] - 2026-06-17 + +### Performance +- Faster content extraction with byte-identical output. 
The markdown noise filter no longer recompiles its CSS selectors on every element; the vertical extractors share a single Open Graph meta parse instead of re-scanning the page per field; the JavaScript sandbox is skipped entirely when a page has no JS-assigned data (and reuses the already-parsed document instead of re-parsing); and the HTTP client now tunes its connection pool (connect timeout, idle-pool reuse, keep-alive) for better connection reuse across requests. + +## [0.6.12] - 2026-06-17 + +### Added +- **Standalone web search** using your own [Serper.dev](https://serper.dev) key — no hosted webclaw account needed. Available across the CLI (`webclaw search "query" --num 5 --scrape`, key via `--serper-key` or `SERPER_API_KEY`), the MCP `search` tool (local-first when `SERPER_API_KEY` is set, hosted API otherwise), and the self-hosted REST server (`POST /v1/search`, enabled when started with `SERPER_API_KEY`). With `--scrape`, the top result pages are fetched and extracted to markdown. +- **Layered URL discovery for `--map`**: when a site has no sitemap or only a thin one, map now falls back to a bounded same-origin crawl and harvests links from every fetched page plus the unfetched frontier, returning far more URLs. Adds gzipped-sitemap (`.xml.gz`) support, deeper sitemap-index recursion, more fallback paths, and `--map-pages` / `--no-map-crawl` / `--map-limit` controls. Crawler logs now go to stderr so `--map --format json` stays machine-parseable. + +### Fixed +- MCP tools now accept boolean arguments whether the client sends them as JSON booleans or as the strings `"true"`/`"false"` (case-insensitive). Some MCP clients (e.g. Claude Desktop) send booleans as strings, which previously failed the call with a deserialization error. Affects `scrape` (only_main_content), `crawl` (use_sitemap), `research` (deep), and `search` (scrape). This completes the earlier numeric-parameter fix. 
+ +## [0.6.11] - 2026-06-16 + +### Added +- New **Google Gemini** provider in the LLM provider chain. Set `GEMINI_API_KEY` (and optionally `GEMINI_MODEL`, default `gemini-2.5-flash`) to enable it; the chain tries Ollama → OpenAI → Gemini → Anthropic and uses the first available provider. + +### Fixed +- The Anthropic provider's default model pointed at a retired model id that now returns `404`, which could fail extraction/summarization when falling back to Anthropic. It now defaults to a current model and is overridable via `ANTHROPIC_MODEL`. + +## [0.6.10] - 2026-06-15 + +### Fixed +- MCP tools that take numeric arguments now accept those values whether the client sends them as numbers or as numeric strings. Some MCP clients (e.g. Claude Desktop) send `"5"` instead of `5`, which previously failed the call with a deserialization error. Affects `crawl` (depth, max_pages, concurrency), `batch` (concurrency), `search` (num_results), and `summarize` (max_sentences). + +## [0.6.9] - 2026-06-10 + +### Fixed +- The multi-arch Docker image (linux/amd64 + linux/arm64) now publishes reliably on each release. The build moved to Buildx so registry pushes no longer fail intermittently, and the Homebrew formula update that depends on it is no longer skipped. + +## [0.6.8] - 2026-06-10 + +### Fixed +- Pages with multibyte text (accented or CJK characters) no longer panic or get mangled during extraction. API-endpoint discovery now cuts oversized scripts on a character boundary instead of crashing mid-character, and structured-data parsing preserves non-ASCII string values instead of turning them into mojibake. +- LLM error messages from a provider no longer panic when the error body contains multibyte characters near the truncation point. +- LLM provider requests now have explicit connect and overall timeouts, so a stalled or unreachable provider fails fast instead of hanging. 
+- Batch extraction in the MCP server no longer aborts the whole batch when a single URL fails to resolve; bad URLs are reported as individual per-URL errors and the rest still run. +- CLI crawl and batch runs now wait for the completion webhook to actually send before exiting, replacing a fixed delay that could cut the request off or waste time. +- Homepage warm-up requests now include the port for hosts on a non-default port, so those sites are warmed correctly. + +--- + +## [0.6.7] — 2026-06-09 + +### Changed +- Updated the HTTP/TLS engine (wreq 6.0.0-rc.29, wreq-util 3.0.0-rc.12). This pulls in upstream robustness fixes: no more panic on responses with non-UTF-8 header values, a fix for short reads when decoding large compressed bodies, and a restored TCP nodelay setting. Browser TLS fingerprints are unchanged. + +--- + ## [0.6.6] — 2026-06-09 ### Added diff --git a/CLAUDE.md b/CLAUDE.md index b30bd84..387c2dd 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -15,6 +15,7 @@ webclaw/ # + proxy pool rotation (per-request) # + PDF content-type detection # + document parsing (DOCX, XLSX, CSV) + # + layered URL discovery (map) + Serper web search (BYO key) - webclaw-llm/ # LLM provider chain (Ollama -> OpenAI -> Anthropic) + webclaw-llm/ # LLM provider chain (Ollama -> OpenAI -> Gemini -> Anthropic) # + JSON schema extraction, prompt extraction, summarization webclaw-pdf/ # PDF text extraction via pdf-extract @@ -30,25 +31,34 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R - `extractor.rs` — Readability-style scoring: text density, semantic tags, link density penalty - `noise.rs` — Shared noise filter: tags, ARIA roles, class/ID patterns. Tailwind-safe. - `data_island.rs` — JSON data island extraction for React SPAs, Next.js, Contentful CMS +- `structured_data.rs` — JSON-LD, Next.js `__NEXT_DATA__`, and SvelteKit data-island extraction +- `js_eval.rs` — QuickJS sandbox (rquickjs) that runs inline `