mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-10 22:45:13 +02:00
Compare commits
12 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
df7336d55b | ||
|
|
acd3021f38 | ||
|
|
bcc58dbadd | ||
|
|
8015de7db5 | ||
|
|
be64409d62 | ||
|
|
2773474984 | ||
|
|
7dfa180e86 | ||
|
|
598f319bf3 | ||
|
|
fae2766db1 | ||
|
|
d0909a25e3 | ||
|
|
499345046c | ||
|
|
d0d7b835f2 |
17 changed files with 272 additions and 101 deletions
BIN
.github/banner.png
vendored
BIN
.github/banner.png
vendored
Binary file not shown.
|
Before Width: | Height: | Size: 44 KiB After Width: | Height: | Size: 48 KiB |
74
.github/workflows/release.yml
vendored
74
.github/workflows/release.yml
vendored
|
|
@ -3,6 +3,15 @@ name: Release
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
tags: ["v*"]
|
tags: ["v*"]
|
||||||
|
# Manual re-publish of the Docker image for an existing release, without
|
||||||
|
# rebuilding binaries or cutting a new version. Runs only the docker (+
|
||||||
|
# homebrew) jobs against the given tag's already-published release assets.
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
tag:
|
||||||
|
description: "Existing release tag to (re)build + push the Docker image for, e.g. v0.6.9"
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
|
|
@ -12,6 +21,9 @@ env:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build:
|
build:
|
||||||
|
# Binaries are only built when a tag is pushed. A manual dispatch reuses
|
||||||
|
# the existing release's binaries, so it skips this job entirely.
|
||||||
|
if: github.event_name == 'push'
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
name: Build ${{ matrix.target }}
|
name: Build ${{ matrix.target }}
|
||||||
|
|
@ -105,6 +117,7 @@ jobs:
|
||||||
|
|
||||||
release:
|
release:
|
||||||
name: Release
|
name: Release
|
||||||
|
if: github.event_name == 'push'
|
||||||
needs: build
|
needs: build
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
permissions:
|
permissions:
|
||||||
|
|
@ -137,6 +150,10 @@ jobs:
|
||||||
docker:
|
docker:
|
||||||
name: Docker
|
name: Docker
|
||||||
needs: release
|
needs: release
|
||||||
|
# Runs after a successful release on tag push, or standalone via
|
||||||
|
# workflow_dispatch to (re)publish an existing tag's image. `always()` lets
|
||||||
|
# it run even though `release` is skipped on a manual dispatch.
|
||||||
|
if: ${{ always() && (github.event_name == 'workflow_dispatch' || needs.release.result == 'success') }}
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
|
|
@ -156,49 +173,48 @@ jobs:
|
||||||
username: ${{ github.actor }}
|
username: ${{ github.actor }}
|
||||||
password: ${{ secrets.GITHUB_TOKEN }}
|
password: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
# Download pre-built binaries for both architectures
|
# The pushed tag, or the workflow_dispatch input for a manual re-publish.
|
||||||
|
- name: Resolve tag
|
||||||
|
id: tag
|
||||||
|
run: echo "tag=${{ github.event.inputs.tag || github.ref_name }}" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
# Download pre-built binaries into TARGETARCH-named dirs (amd64/arm64) so
|
||||||
|
# a single multi-platform build picks the matching binary per platform.
|
||||||
- name: Download release binaries
|
- name: Download release binaries
|
||||||
run: |
|
run: |
|
||||||
tag="${GITHUB_REF#refs/tags/}"
|
tag="${{ steps.tag.outputs.tag }}"
|
||||||
|
declare -A arch=( [x86_64-unknown-linux-gnu]=amd64 [aarch64-unknown-linux-gnu]=arm64 )
|
||||||
for target in x86_64-unknown-linux-gnu aarch64-unknown-linux-gnu; do
|
for target in x86_64-unknown-linux-gnu aarch64-unknown-linux-gnu; do
|
||||||
dir="webclaw-${tag}-${target}"
|
dir="webclaw-${tag}-${target}"
|
||||||
curl -sSL "https://github.com/0xMassi/webclaw/releases/download/${tag}/${dir}.tar.gz" -o "${target}.tar.gz"
|
curl -sSL "https://github.com/0xMassi/webclaw/releases/download/${tag}/${dir}.tar.gz" -o "${target}.tar.gz"
|
||||||
tar xzf "${target}.tar.gz"
|
tar xzf "${target}.tar.gz"
|
||||||
mkdir -p "binaries-${target}"
|
a="${arch[$target]}"
|
||||||
cp "${dir}/webclaw" "binaries-${target}/webclaw"
|
mkdir -p "binaries-${a}"
|
||||||
cp "${dir}/webclaw-mcp" "binaries-${target}/webclaw-mcp"
|
cp "${dir}/webclaw" "${dir}/webclaw-mcp" "${dir}/webclaw-server" "binaries-${a}/"
|
||||||
cp "${dir}/webclaw-server" "binaries-${target}/webclaw-server"
|
chmod +x "binaries-${a}"/*
|
||||||
chmod +x "binaries-${target}"/*
|
|
||||||
done
|
done
|
||||||
ls -laR binaries-*/
|
ls -laR binaries-*/
|
||||||
|
|
||||||
# Build per-arch images with plain docker build (no buildx manifest nesting)
|
# One atomic multi-platform build + push. buildx assembles a single
|
||||||
|
# manifest list and pushes it in one shot, so there is no separate
|
||||||
|
# `imagetools create` step to race GHCR's read-after-write (that is what
|
||||||
|
# failed before: "v0.6.9-arm64: not found"). Provenance/SBOM attestations
|
||||||
|
# are disabled so each platform entry stays a plain image manifest.
|
||||||
- name: Build and push
|
- name: Build and push
|
||||||
run: |
|
run: |
|
||||||
tag="${GITHUB_REF#refs/tags/}"
|
tag="${{ steps.tag.outputs.tag }}"
|
||||||
|
docker buildx build -f Dockerfile.ci \
|
||||||
# amd64
|
--platform linux/amd64,linux/arm64 \
|
||||||
docker build -f Dockerfile.ci --build-arg BINARY_DIR=binaries-x86_64-unknown-linux-gnu \
|
--provenance=false --sbom=false \
|
||||||
--platform linux/amd64 -t ghcr.io/0xmassi/webclaw:${tag}-amd64 --push .
|
-t "ghcr.io/0xmassi/webclaw:${tag}" \
|
||||||
|
-t ghcr.io/0xmassi/webclaw:latest \
|
||||||
# arm64
|
--push .
|
||||||
docker build -f Dockerfile.ci --build-arg BINARY_DIR=binaries-aarch64-unknown-linux-gnu \
|
|
||||||
--platform linux/arm64 -t ghcr.io/0xmassi/webclaw:${tag}-arm64 --push .
|
|
||||||
|
|
||||||
# Multi-arch manifest
|
|
||||||
docker manifest create ghcr.io/0xmassi/webclaw:${tag} \
|
|
||||||
ghcr.io/0xmassi/webclaw:${tag}-amd64 \
|
|
||||||
ghcr.io/0xmassi/webclaw:${tag}-arm64
|
|
||||||
docker manifest push ghcr.io/0xmassi/webclaw:${tag}
|
|
||||||
|
|
||||||
docker manifest create ghcr.io/0xmassi/webclaw:latest \
|
|
||||||
ghcr.io/0xmassi/webclaw:${tag}-amd64 \
|
|
||||||
ghcr.io/0xmassi/webclaw:${tag}-arm64
|
|
||||||
docker manifest push ghcr.io/0xmassi/webclaw:latest
|
|
||||||
|
|
||||||
homebrew:
|
homebrew:
|
||||||
name: Update Homebrew
|
name: Update Homebrew
|
||||||
needs: [release, docker]
|
needs: [release, docker]
|
||||||
|
# Runs once Docker succeeds, on both tag push and manual re-publish.
|
||||||
|
if: ${{ always() && needs.docker.result == 'success' }}
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
|
|
@ -207,7 +223,7 @@ jobs:
|
||||||
env:
|
env:
|
||||||
COMMITTER_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }}
|
COMMITTER_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
tag="${GITHUB_REF#refs/tags/}"
|
tag="${{ github.event.inputs.tag || github.ref_name }}"
|
||||||
base="https://github.com/0xMassi/webclaw/releases/download/${tag}"
|
base="https://github.com/0xMassi/webclaw/releases/download/${tag}"
|
||||||
|
|
||||||
# Download all tarballs (Linux + macOS) and compute SHAs
|
# Download all tarballs (Linux + macOS) and compute SHAs
|
||||||
|
|
|
||||||
19
CHANGELOG.md
19
CHANGELOG.md
|
|
@ -3,6 +3,25 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [Unreleased]
|
||||||
|
|
||||||
|
## [0.6.9] - 2026-06-10
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- The multi-arch Docker image (linux/amd64 + linux/arm64) now publishes reliably on each release. The build moved to Buildx so registry pushes no longer fail intermittently, and the Homebrew formula update that depends on it is no longer skipped.
|
||||||
|
|
||||||
|
## [0.6.8] - 2026-06-10
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Pages with multibyte text (accented or CJK characters) no longer panic or get mangled during extraction. API-endpoint discovery now cuts oversized scripts on a character boundary instead of crashing mid-character, and structured-data parsing preserves non-ASCII string values instead of turning them into mojibake.
|
||||||
|
- LLM error messages from a provider no longer panic when the error body contains multibyte characters near the truncation point.
|
||||||
|
- LLM provider requests now have explicit connect and overall timeouts, so a stalled or unreachable provider fails fast instead of hanging.
|
||||||
|
- Batch extraction in the MCP server no longer aborts the whole batch when a single URL fails to resolve; bad URLs are reported as individual per-URL errors and the rest still run.
|
||||||
|
- CLI crawl and batch runs now wait for the completion webhook to actually send before exiting, replacing a fixed delay that could cut the request off or waste time.
|
||||||
|
- Homepage warm-up requests now include the port for hosts on a non-default port, so those sites are warmed correctly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## [0.6.7] — 2026-06-09
|
## [0.6.7] — 2026-06-09
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
|
|
||||||
14
Cargo.lock
generated
14
Cargo.lock
generated
|
|
@ -3221,7 +3221,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.6.7"
|
version = "0.6.9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3242,7 +3242,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.6.7"
|
version = "0.6.9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -3260,7 +3260,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.6.7"
|
version = "0.6.9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes",
|
||||||
|
|
@ -3287,7 +3287,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.6.7"
|
version = "0.6.9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3300,7 +3300,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.6.7"
|
version = "0.6.9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dirs",
|
"dirs",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3320,7 +3320,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.6.7"
|
version = "0.6.9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
@ -3329,7 +3329,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-server"
|
name = "webclaw-server"
|
||||||
version = "0.6.7"
|
version = "0.6.9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"axum",
|
"axum",
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
||||||
members = ["crates/*"]
|
members = ["crates/*"]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "0.6.7"
|
version = "0.6.9"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
license = "AGPL-3.0"
|
license = "AGPL-3.0"
|
||||||
repository = "https://github.com/0xMassi/webclaw"
|
repository = "https://github.com/0xMassi/webclaw"
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
# Slim runtime image — uses pre-built binaries from the release.
|
# Slim runtime image — uses pre-built binaries from the release.
|
||||||
# The full Dockerfile (multi-stage Rust build) is for local development.
|
# The full Dockerfile (multi-stage Rust build) is for local development.
|
||||||
# CI uses this to avoid 60+ min QEMU cross-compilation.
|
# CI uses this to avoid 60+ min QEMU cross-compilation.
|
||||||
ARG BINARY_DIR=binaries
|
|
||||||
|
|
||||||
FROM ubuntu:24.04
|
FROM ubuntu:24.04
|
||||||
|
|
||||||
|
|
@ -10,10 +9,13 @@ FROM ubuntu:24.04
|
||||||
# CI runners and breaks the multi-arch release build. No build-time network.
|
# CI runners and breaks the multi-arch release build. No build-time network.
|
||||||
COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
||||||
|
|
||||||
ARG BINARY_DIR
|
# TARGETARCH (amd64 / arm64) is provided automatically by buildx for each
|
||||||
COPY ${BINARY_DIR}/webclaw /usr/local/bin/webclaw
|
# target platform, so one multi-platform build copies the matching binaries.
|
||||||
COPY ${BINARY_DIR}/webclaw-mcp /usr/local/bin/webclaw-mcp
|
# The release workflow stages them in binaries-amd64 / binaries-arm64.
|
||||||
COPY ${BINARY_DIR}/webclaw-server /usr/local/bin/webclaw-server
|
ARG TARGETARCH
|
||||||
|
COPY binaries-${TARGETARCH}/webclaw /usr/local/bin/webclaw
|
||||||
|
COPY binaries-${TARGETARCH}/webclaw-mcp /usr/local/bin/webclaw-mcp
|
||||||
|
COPY binaries-${TARGETARCH}/webclaw-server /usr/local/bin/webclaw-server
|
||||||
|
|
||||||
# Default REST API port when running `webclaw-server` inside the container.
|
# Default REST API port when running `webclaw-server` inside the container.
|
||||||
EXPOSE 3000
|
EXPOSE 3000
|
||||||
|
|
@ -25,8 +27,9 @@ ENV WEBCLAW_HOST=0.0.0.0
|
||||||
|
|
||||||
# Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other
|
# Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other
|
||||||
# commands directly so this image can be used as a FROM base with custom CMD.
|
# commands directly so this image can be used as a FROM base with custom CMD.
|
||||||
COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
|
# `--chmod` sets the bit at copy time so the build needs no in-container `RUN`
|
||||||
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
|
# (and thus no QEMU emulation for the arm64 platform).
|
||||||
|
COPY --chmod=755 docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
|
||||||
|
|
||||||
ENTRYPOINT ["docker-entrypoint.sh"]
|
ENTRYPOINT ["docker-entrypoint.sh"]
|
||||||
CMD ["webclaw", "--help"]
|
CMD ["webclaw", "--help"]
|
||||||
|
|
|
||||||
19
README.md
19
README.md
|
|
@ -142,7 +142,7 @@ webclaw https://docs.rust-lang.org --crawl --depth 2 --max-pages 50
|
||||||
- [HTML to Markdown for RAG](examples/html-to-markdown-rag/)
|
- [HTML to Markdown for RAG](examples/html-to-markdown-rag/)
|
||||||
- [Firecrawl-compatible API](examples/firecrawl-compatible-api/)
|
- [Firecrawl-compatible API](examples/firecrawl-compatible-api/)
|
||||||
- [MCP web scraping](examples/mcp-web-scraping/)
|
- [MCP web scraping](examples/mcp-web-scraping/)
|
||||||
- [Proxy-backed crawling](examples/proxy-backed-crawling/)
|
- [Proxy-backed crawling with ColdProxy](examples/proxy-backed-crawling/)
|
||||||
- [Cloudflare diagnostics](examples/cloudflare-diagnostics/)
|
- [Cloudflare diagnostics](examples/cloudflare-diagnostics/)
|
||||||
|
|
||||||
### Extract brand assets
|
### Extract brand assets
|
||||||
|
|
@ -401,6 +401,8 @@ Please remove secrets, cookies, private tokens, and customer data from logs befo
|
||||||
residential IPv6, and datacenter IPv6 proxy infrastructure across 195+ countries for public data
|
residential IPv6, and datacenter IPv6 proxy infrastructure across 195+ countries for public data
|
||||||
collection, regional testing, monitoring, and web scraping workflows. Explore
|
collection, regional testing, monitoring, and web scraping workflows. Explore
|
||||||
<a href="https://coldproxy.com/">ColdProxy</a>'s latest plans and available offers directly on the website.
|
<a href="https://coldproxy.com/">ColdProxy</a>'s latest plans and available offers directly on the website.
|
||||||
|
See the <a href="examples/proxy-backed-crawling/#using-coldproxy">proxy-backed crawling guide</a>
|
||||||
|
for a hands-on walkthrough of wiring ColdProxy into webclaw.
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
</table>
|
</table>
|
||||||
|
|
@ -410,6 +412,21 @@ Please remove secrets, cookies, private tokens, and customer data from logs befo
|
||||||
## Studio Partners
|
## Studio Partners
|
||||||
|
|
||||||
<table>
|
<table>
|
||||||
|
<tr>
|
||||||
|
<td width="340" align="center">
|
||||||
|
<a href="https://go.nodemaven.com/webclaw">
|
||||||
|
<img src="./assets/sponsors/nodemaven-banner.png" alt="NodeMaven" width="300" />
|
||||||
|
</a>
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<strong>NodeMaven</strong> is the most reliable proxy provider with the highest-quality IPs on the market.
|
||||||
|
Best solution for automation, web scraping, SEO research, and social media management: 99.9% uptime,
|
||||||
|
sticky sessions up to 7 days, IP filtering (all proxies under a 97% fraud score), no KYC, and cashback up
|
||||||
|
to 10% on traffic. Use <code>WEBCLAW35</code> for 35% off Mobile and Residential proxies, or
|
||||||
|
<code>WEBCLAW40</code> for 40% off ISP (Static) proxies at
|
||||||
|
<a href="https://go.nodemaven.com/webclaw">NodeMaven</a>.
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td width="340" align="center">
|
<td width="340" align="center">
|
||||||
<a href="https://quantumproxies.net/?utm_source=webclaw&utm_medium=github&utm_campaign=sponsor">
|
<a href="https://quantumproxies.net/?utm_source=webclaw&utm_medium=github&utm_campaign=sponsor">
|
||||||
|
|
|
||||||
BIN
assets/sponsors/nodemaven-banner.png
Normal file
BIN
assets/sponsors/nodemaven-banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 302 KiB |
|
|
@ -1548,7 +1548,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
||||||
// Fire webhook on crawl complete
|
// Fire webhook on crawl complete
|
||||||
if let Some(ref webhook_url) = cli.webhook {
|
if let Some(ref webhook_url) = cli.webhook {
|
||||||
let urls: Vec<&str> = result.pages.iter().map(|p| p.url.as_str()).collect();
|
let urls: Vec<&str> = result.pages.iter().map(|p| p.url.as_str()).collect();
|
||||||
fire_webhook(
|
let handle = fire_webhook(
|
||||||
webhook_url,
|
webhook_url,
|
||||||
&serde_json::json!({
|
&serde_json::json!({
|
||||||
"event": "crawl_complete",
|
"event": "crawl_complete",
|
||||||
|
|
@ -1559,8 +1559,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
||||||
"urls": urls,
|
"urls": urls,
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
// Brief pause so the async webhook has time to fire
|
// Wait for the webhook to finish so the process doesn't exit mid-send.
|
||||||
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
|
let _ = handle.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
if result.errors > 0 {
|
if result.errors > 0 {
|
||||||
|
|
@ -1658,7 +1658,7 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
|
||||||
// Fire webhook on batch complete
|
// Fire webhook on batch complete
|
||||||
if let Some(ref webhook_url) = cli.webhook {
|
if let Some(ref webhook_url) = cli.webhook {
|
||||||
let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect();
|
let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect();
|
||||||
fire_webhook(
|
let handle = fire_webhook(
|
||||||
webhook_url,
|
webhook_url,
|
||||||
&serde_json::json!({
|
&serde_json::json!({
|
||||||
"event": "batch_complete",
|
"event": "batch_complete",
|
||||||
|
|
@ -1668,7 +1668,7 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
|
||||||
"urls": urls,
|
"urls": urls,
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
|
let _ = handle.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
if errors > 0 {
|
if errors > 0 {
|
||||||
|
|
@ -1742,9 +1742,12 @@ async fn spawn_on_change(cmd: &str, stdin_payload: &[u8]) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr.
|
/// Fire a webhook POST with a JSON payload. Spawns the send on a background task
|
||||||
/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly.
|
/// and returns its `JoinHandle` so callers that need delivery (e.g. one-shot
|
||||||
fn fire_webhook(url: &str, payload: &serde_json::Value) {
|
/// crawl/batch runs that exit immediately after) can `.await` it; long-running
|
||||||
|
/// loops can drop the handle and let it run fire-and-forget. Errors are logged
|
||||||
|
/// to stderr. Auto-detects Discord and Slack webhook URLs and wraps the payload.
|
||||||
|
fn fire_webhook(url: &str, payload: &serde_json::Value) -> tokio::task::JoinHandle<()> {
|
||||||
let url = url.to_string();
|
let url = url.to_string();
|
||||||
let is_discord = url.contains("discord.com/api/webhooks");
|
let is_discord = url.contains("discord.com/api/webhooks");
|
||||||
let is_slack = url.contains("hooks.slack.com");
|
let is_slack = url.contains("hooks.slack.com");
|
||||||
|
|
@ -1806,7 +1809,7 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
|
||||||
},
|
},
|
||||||
Err(e) => eprintln!("[webhook] client error: {e}"),
|
Err(e) => eprintln!("[webhook] client error: {e}"),
|
||||||
}
|
}
|
||||||
});
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
|
async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
|
||||||
|
|
@ -2318,7 +2321,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
|
||||||
eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");
|
eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");
|
||||||
|
|
||||||
if let Some(ref webhook_url) = cli.webhook {
|
if let Some(ref webhook_url) = cli.webhook {
|
||||||
fire_webhook(
|
let handle = fire_webhook(
|
||||||
webhook_url,
|
webhook_url,
|
||||||
&serde_json::json!({
|
&serde_json::json!({
|
||||||
"event": "batch_llm_complete",
|
"event": "batch_llm_complete",
|
||||||
|
|
@ -2327,7 +2330,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
|
||||||
"errors": errors,
|
"errors": errors,
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
|
let _ = handle.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
if errors > 0 {
|
if errors > 0 {
|
||||||
|
|
|
||||||
|
|
@ -233,7 +233,13 @@ pub fn extract_endpoints(
|
||||||
}
|
}
|
||||||
let slice = if text.len() > *budget {
|
let slice = if text.len() > *budget {
|
||||||
*truncated = true;
|
*truncated = true;
|
||||||
&text[..*budget]
|
// Snap the cut to a UTF-8 char boundary so non-ASCII content
|
||||||
|
// (multibyte codepoints straddling the budget) can't panic.
|
||||||
|
let mut cut = (*budget).min(text.len());
|
||||||
|
while cut > 0 && !text.is_char_boundary(cut) {
|
||||||
|
cut -= 1;
|
||||||
|
}
|
||||||
|
&text[..cut]
|
||||||
} else {
|
} else {
|
||||||
text
|
text
|
||||||
};
|
};
|
||||||
|
|
@ -512,4 +518,16 @@ mod tests {
|
||||||
);
|
);
|
||||||
assert!(r.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk"));
|
assert!(r.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn scan_truncation_at_non_ascii_boundary_does_not_panic() {
|
||||||
|
// A bundle just over the scan budget, padded with a multibyte char
|
||||||
|
// ('é' is 2 bytes) so the cut lands mid-codepoint. The old
|
||||||
|
// `&text[..budget]` slice panicked here; the boundary snap must not.
|
||||||
|
let pad = "é".repeat(MAX_SCAN_BYTES); // ~2× budget in bytes
|
||||||
|
let bundle = format!("{pad} fetch(\"/api/x\")");
|
||||||
|
let bundles = vec![("big.js".to_string(), bundle)];
|
||||||
|
let r = extract_endpoints("<html></html>", "https://example.com/", &bundles);
|
||||||
|
assert!(r.truncated, "oversized bundle should mark truncated");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -178,7 +178,12 @@ pub fn extract_sveltekit(html: &str) -> Vec<Value> {
|
||||||
/// Preserves already-quoted keys and string values.
|
/// Preserves already-quoted keys and string values.
|
||||||
fn js_literal_to_json(input: &str) -> String {
|
fn js_literal_to_json(input: &str) -> String {
|
||||||
let bytes = input.as_bytes();
|
let bytes = input.as_bytes();
|
||||||
let mut out = String::with_capacity(input.len() + input.len() / 10);
|
// Accumulate raw bytes, not `byte as char`. The input is valid UTF-8 and we
|
||||||
|
// only ever copy its bytes verbatim or insert ASCII quotes, so the result is
|
||||||
|
// guaranteed valid UTF-8 — copying byte-by-byte preserves multibyte
|
||||||
|
// codepoints (e.g. accented/CJK string values) instead of mangling them
|
||||||
|
// into Latin-1 mojibake.
|
||||||
|
let mut out: Vec<u8> = Vec::with_capacity(input.len() + input.len() / 10);
|
||||||
let mut i = 0;
|
let mut i = 0;
|
||||||
let len = bytes.len();
|
let len = bytes.len();
|
||||||
|
|
||||||
|
|
@ -187,14 +192,14 @@ fn js_literal_to_json(input: &str) -> String {
|
||||||
|
|
||||||
// Skip through strings
|
// Skip through strings
|
||||||
if b == b'"' {
|
if b == b'"' {
|
||||||
out.push('"');
|
out.push(b'"');
|
||||||
i += 1;
|
i += 1;
|
||||||
while i < len {
|
while i < len {
|
||||||
let c = bytes[i];
|
let c = bytes[i];
|
||||||
out.push(c as char);
|
out.push(c);
|
||||||
i += 1;
|
i += 1;
|
||||||
if c == b'\\' && i < len {
|
if c == b'\\' && i < len {
|
||||||
out.push(bytes[i] as char);
|
out.push(bytes[i]);
|
||||||
i += 1;
|
i += 1;
|
||||||
} else if c == b'"' {
|
} else if c == b'"' {
|
||||||
break;
|
break;
|
||||||
|
|
@ -205,11 +210,11 @@ fn js_literal_to_json(input: &str) -> String {
|
||||||
|
|
||||||
// After { or , — look for unquoted key followed by :
|
// After { or , — look for unquoted key followed by :
|
||||||
if (b == b'{' || b == b',' || b == b'[') && i + 1 < len {
|
if (b == b'{' || b == b',' || b == b'[') && i + 1 < len {
|
||||||
out.push(b as char);
|
out.push(b);
|
||||||
i += 1;
|
i += 1;
|
||||||
// Skip whitespace
|
// Skip whitespace
|
||||||
while i < len && bytes[i].is_ascii_whitespace() {
|
while i < len && bytes[i].is_ascii_whitespace() {
|
||||||
out.push(bytes[i] as char);
|
out.push(bytes[i]);
|
||||||
i += 1;
|
i += 1;
|
||||||
}
|
}
|
||||||
// Check if next is an unquoted identifier (key)
|
// Check if next is an unquoted identifier (key)
|
||||||
|
|
@ -218,29 +223,30 @@ fn js_literal_to_json(input: &str) -> String {
|
||||||
while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
|
while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
|
||||||
i += 1;
|
i += 1;
|
||||||
}
|
}
|
||||||
let key = &input[key_start..i];
|
let key = &bytes[key_start..i];
|
||||||
// Skip whitespace after key
|
// Skip whitespace after key
|
||||||
while i < len && bytes[i].is_ascii_whitespace() {
|
while i < len && bytes[i].is_ascii_whitespace() {
|
||||||
i += 1;
|
i += 1;
|
||||||
}
|
}
|
||||||
// If followed by :, it's an unquoted key — quote it
|
// If followed by :, it's an unquoted key — quote it
|
||||||
if i < len && bytes[i] == b':' {
|
if i < len && bytes[i] == b':' {
|
||||||
out.push('"');
|
out.push(b'"');
|
||||||
out.push_str(key);
|
out.extend_from_slice(key);
|
||||||
out.push('"');
|
out.push(b'"');
|
||||||
} else {
|
} else {
|
||||||
// Not a key — might be a bare value like true/false/null
|
// Not a key — might be a bare value like true/false/null
|
||||||
out.push_str(key);
|
out.extend_from_slice(key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
out.push(b as char);
|
out.push(b);
|
||||||
i += 1;
|
i += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
out
|
// Safe: we only copied bytes from valid-UTF-8 `input` plus ASCII quotes.
|
||||||
|
String::from_utf8(out).unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Replace raw newlines/tabs inside JSON string values with escape sequences.
|
/// Replace raw newlines/tabs inside JSON string values with escape sequences.
|
||||||
|
|
@ -440,4 +446,17 @@ newline"}"#;
|
||||||
assert_eq!(parsed["text"], "line1\nline2");
|
assert_eq!(parsed["text"], "line1\nline2");
|
||||||
assert_eq!(parsed["raw"], "has\nnewline");
|
assert_eq!(parsed["raw"], "has\nnewline");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn js_literal_to_json_preserves_multibyte_utf8() {
|
||||||
|
// Unquoted ASCII keys with accented and CJK string values (the shape
|
||||||
|
// SvelteKit emits). The old `byte as char` path turned the multibyte
|
||||||
|
// values into Latin-1 mojibake; they must now survive intact.
|
||||||
|
let input = r#"{name:"déjà vu", city:"東京", emoji:"🌱"}"#;
|
||||||
|
let json = js_literal_to_json(input);
|
||||||
|
let parsed: Value = serde_json::from_str(&json).unwrap();
|
||||||
|
assert_eq!(parsed["name"], "déjà vu");
|
||||||
|
assert_eq!(parsed["city"], "東京");
|
||||||
|
assert_eq!(parsed["emoji"], "🌱");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -801,11 +801,17 @@ fn is_challenge_html(html: &str) -> bool {
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract the homepage URL (scheme + host) from a full URL.
|
/// Extract the homepage URL (scheme + host[:port]) from a full URL.
|
||||||
fn extract_homepage(url: &str) -> Option<String> {
|
fn extract_homepage(url: &str) -> Option<String> {
|
||||||
url::Url::parse(url)
|
url::Url::parse(url).ok().map(|u| {
|
||||||
.ok()
|
let host = u.host_str().unwrap_or("");
|
||||||
.map(|u| format!("{}://{}/", u.scheme(), u.host_str().unwrap_or("")))
|
// `port()` is `Some` only for a non-default port; include it so a
|
||||||
|
// host like example.com:8443 is warmed on the right port.
|
||||||
|
match u.port() {
|
||||||
|
Some(port) => format!("{}://{}:{}/", u.scheme(), host, port),
|
||||||
|
None => format!("{}://{}/", u.scheme(), host),
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.
|
/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,8 @@
|
||||||
/// Anthropic provider — Claude models via api.anthropic.com.
|
/// Anthropic provider — Claude models via api.anthropic.com.
|
||||||
/// Anthropic's API differs from OpenAI: system message is a top-level param,
|
/// Anthropic's API differs from OpenAI: system message is a top-level param,
|
||||||
/// not part of the messages array.
|
/// not part of the messages array.
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
|
||||||
|
|
@ -35,7 +37,11 @@ impl AnthropicProvider {
|
||||||
let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?;
|
let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?;
|
||||||
|
|
||||||
Some(Self {
|
Some(Self {
|
||||||
client: reqwest::Client::new(),
|
client: reqwest::Client::builder()
|
||||||
|
.timeout(Duration::from_secs(120))
|
||||||
|
.connect_timeout(Duration::from_secs(10))
|
||||||
|
.build()
|
||||||
|
.unwrap_or_else(|_| reqwest::Client::new()),
|
||||||
key,
|
key,
|
||||||
base_url: base_url
|
base_url: base_url
|
||||||
.or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok())
|
.or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok())
|
||||||
|
|
@ -108,11 +114,7 @@ impl LlmProvider for AnthropicProvider {
|
||||||
if !resp.status().is_success() {
|
if !resp.status().is_success() {
|
||||||
let status = resp.status();
|
let status = resp.status();
|
||||||
let text = resp.text().await.unwrap_or_default();
|
let text = resp.text().await.unwrap_or_default();
|
||||||
let safe_text = if text.len() > 500 {
|
let safe_text = text.chars().take(500).collect::<String>();
|
||||||
&text[..500]
|
|
||||||
} else {
|
|
||||||
&text
|
|
||||||
};
|
|
||||||
return Err(LlmError::ProviderError(format!(
|
return Err(LlmError::ProviderError(format!(
|
||||||
"anthropic returned {status}: {safe_text}"
|
"anthropic returned {status}: {safe_text}"
|
||||||
)));
|
)));
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
/// Ollama provider — talks to a local Ollama instance (default localhost:11434).
|
/// Ollama provider — talks to a local Ollama instance (default localhost:11434).
|
||||||
/// First choice in the provider chain: free, private, fast on Apple Silicon.
|
/// First choice in the provider chain: free, private, fast on Apple Silicon.
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
|
||||||
|
|
@ -24,7 +26,11 @@ impl OllamaProvider {
|
||||||
.unwrap_or_else(|| "qwen3:8b".into());
|
.unwrap_or_else(|| "qwen3:8b".into());
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
client: reqwest::Client::new(),
|
client: reqwest::Client::builder()
|
||||||
|
.timeout(Duration::from_secs(120))
|
||||||
|
.connect_timeout(Duration::from_secs(10))
|
||||||
|
.build()
|
||||||
|
.unwrap_or_else(|_| reqwest::Client::new()),
|
||||||
base_url,
|
base_url,
|
||||||
default_model,
|
default_model,
|
||||||
}
|
}
|
||||||
|
|
@ -70,11 +76,7 @@ impl LlmProvider for OllamaProvider {
|
||||||
if !resp.status().is_success() {
|
if !resp.status().is_success() {
|
||||||
let status = resp.status();
|
let status = resp.status();
|
||||||
let text = resp.text().await.unwrap_or_default();
|
let text = resp.text().await.unwrap_or_default();
|
||||||
let safe_text = if text.len() > 500 {
|
let safe_text = text.chars().take(500).collect::<String>();
|
||||||
&text[..500]
|
|
||||||
} else {
|
|
||||||
&text
|
|
||||||
};
|
|
||||||
return Err(LlmError::ProviderError(format!(
|
return Err(LlmError::ProviderError(format!(
|
||||||
"ollama returned {status}: {safe_text}"
|
"ollama returned {status}: {safe_text}"
|
||||||
)));
|
)));
|
||||||
|
|
@ -98,7 +100,8 @@ impl LlmProvider for OllamaProvider {
|
||||||
|
|
||||||
async fn is_available(&self) -> bool {
|
async fn is_available(&self) -> bool {
|
||||||
let url = format!("{}/api/tags", self.base_url);
|
let url = format!("{}/api/tags", self.base_url);
|
||||||
matches!(self.client.get(&url).send().await, Ok(r) if r.status().is_success())
|
let req = self.client.get(&url).timeout(Duration::from_secs(10));
|
||||||
|
matches!(req.send().await, Ok(r) if r.status().is_success())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn name(&self) -> &str {
|
fn name(&self) -> &str {
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,6 @@
|
||||||
/// OpenAI provider — works with api.openai.com and any OpenAI-compatible endpoint.
|
/// OpenAI provider — works with api.openai.com and any OpenAI-compatible endpoint.
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
|
||||||
|
|
@ -69,7 +71,11 @@ impl OpenAiProvider {
|
||||||
let key = load_api_key(key_override, "OPENAI_API_KEY")?;
|
let key = load_api_key(key_override, "OPENAI_API_KEY")?;
|
||||||
|
|
||||||
Some(Self {
|
Some(Self {
|
||||||
client: reqwest::Client::new(),
|
client: reqwest::Client::builder()
|
||||||
|
.timeout(Duration::from_secs(120))
|
||||||
|
.connect_timeout(Duration::from_secs(10))
|
||||||
|
.build()
|
||||||
|
.unwrap_or_else(|_| reqwest::Client::new()),
|
||||||
key,
|
key,
|
||||||
base_url: base_url
|
base_url: base_url
|
||||||
.or_else(|| std::env::var("OPENAI_BASE_URL").ok())
|
.or_else(|| std::env::var("OPENAI_BASE_URL").ok())
|
||||||
|
|
@ -132,11 +138,7 @@ impl LlmProvider for OpenAiProvider {
|
||||||
if !resp.status().is_success() {
|
if !resp.status().is_success() {
|
||||||
let status = resp.status();
|
let status = resp.status();
|
||||||
let text = resp.text().await.unwrap_or_default();
|
let text = resp.text().await.unwrap_or_default();
|
||||||
let safe_text = if text.len() > 500 {
|
let safe_text = text.chars().take(500).collect::<String>();
|
||||||
&text[..500]
|
|
||||||
} else {
|
|
||||||
&text
|
|
||||||
};
|
|
||||||
return Err(LlmError::ProviderError(format!(
|
return Err(LlmError::ProviderError(format!(
|
||||||
"openai returned {status}: {safe_text}"
|
"openai returned {status}: {safe_text}"
|
||||||
)));
|
)));
|
||||||
|
|
|
||||||
|
|
@ -323,9 +323,10 @@ impl WebclawMcp {
|
||||||
if params.urls.len() > 100 {
|
if params.urls.len() > 100 {
|
||||||
return Err("batch is limited to 100 URLs per request".into());
|
return Err("batch is limited to 100 URLs per request".into());
|
||||||
}
|
}
|
||||||
for u in &params.urls {
|
// No up-front DNS pre-validation: it aborted the whole batch on a
|
||||||
validate_url(u).await?;
|
// single unresolvable URL. The fetch layer applies the same SSRF
|
||||||
}
|
// guard (validate_public_http_url) per URL, so bad entries surface
|
||||||
|
// as individual per-URL errors below instead of failing the batch.
|
||||||
|
|
||||||
let format = params.format.as_deref().unwrap_or("markdown");
|
let format = params.format.as_deref().unwrap_or("markdown");
|
||||||
let concurrency = params.concurrency.unwrap_or(5);
|
let concurrency = params.concurrency.unwrap_or(5);
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,68 @@
|
||||||
# Proxy-Backed Crawling
|
# Proxy-Backed Crawling
|
||||||
|
|
||||||
Use proxy rotation when you need to distribute a crawl across a proxy pool. webclaw supports a single proxy or a proxy file.
|
Use proxy rotation when you need to distribute a crawl across a proxy pool. webclaw supports a single proxy or a proxy file, and accepts any standard HTTP/HTTPS or SOCKS5 proxy URL.
|
||||||
|
|
||||||
|
## Using ColdProxy
|
||||||
|
|
||||||
|
[ColdProxy](https://coldproxy.com/) is webclaw's infrastructure partner, providing residential IPv4, residential IPv6, and datacenter IPv6 proxies across 195+ countries. Use a ColdProxy endpoint as a full URL with `--proxy` / `WEBCLAW_PROXY`, or list several in a `--proxy-file` pool.
|
||||||
|
|
||||||
|
### 1. Get your endpoint
|
||||||
|
|
||||||
|
Sign in to your [ColdProxy dashboard](https://coldproxy.com/) and copy your proxy host, port, and credentials. Assemble them into a standard proxy URL:
|
||||||
|
|
||||||
|
```text
|
||||||
|
http://USERNAME:PASSWORD@HOST:PORT
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. One ColdProxy endpoint
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export WEBCLAW_PROXY="http://USERNAME:PASSWORD@HOST:PORT"
|
||||||
|
webclaw https://example.com --format markdown
|
||||||
|
```
|
||||||
|
|
||||||
|
Or pass it inline:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://example.com \
|
||||||
|
--proxy "http://USERNAME:PASSWORD@HOST:PORT" \
|
||||||
|
--format markdown
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Rotate a ColdProxy pool
|
||||||
|
|
||||||
|
List one ColdProxy endpoint per line in `coldproxy.txt`. Pool files use `host:port:user:pass` (one entry per line; lines starting with `#` are ignored). Mix product types and regions to match your workload:
|
||||||
|
|
||||||
|
```text
|
||||||
|
# residential IPv4
|
||||||
|
HOST:PORT:USERNAME:PASSWORD
|
||||||
|
# residential IPv6
|
||||||
|
HOST:PORT:USERNAME:PASSWORD
|
||||||
|
# datacenter IPv6
|
||||||
|
HOST:PORT:USERNAME:PASSWORD
|
||||||
|
```
|
||||||
|
|
||||||
|
webclaw rotates across the pool per request:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://docs.example.com \
|
||||||
|
--crawl \
|
||||||
|
--depth 2 \
|
||||||
|
--max-pages 200 \
|
||||||
|
--concurrency 10 \
|
||||||
|
--delay 200 \
|
||||||
|
--proxy-file coldproxy.txt \
|
||||||
|
--format markdown
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Target a country
|
||||||
|
|
||||||
|
ColdProxy offers access across 195+ countries. Use the country-specific endpoint from your ColdProxy dashboard for each region you want to collect from (for example, a France residential endpoint for fr-localized pages). Add one endpoint per country to your pool file to spread a single crawl across regions.
|
||||||
|
|
||||||
|
### Choosing a product
|
||||||
|
|
||||||
|
- **Residential IPv4 / IPv6** — highest trust; best for consumer sites, geo-restricted content, and regional QA.
|
||||||
|
- **Datacenter IPv6** — fastest and most cost-effective; best for high-volume crawling of tolerant endpoints.
|
||||||
|
|
||||||
## Single Proxy
|
## Single Proxy
|
||||||
|
|
||||||
|
|
@ -20,12 +82,12 @@ webclaw https://example.com \
|
||||||
|
|
||||||
## Proxy Pool
|
## Proxy Pool
|
||||||
|
|
||||||
Create `proxies.txt` with one proxy per line:
|
Create `proxies.txt` with one proxy per line in `host:port:user:pass` format (lines starting with `#` are ignored):
|
||||||
|
|
||||||
```text
|
```text
|
||||||
http://user:pass@proxy-1.example.com:8080
|
proxy-1.example.com:8080:user:pass
|
||||||
http://user:pass@proxy-2.example.com:8080
|
proxy-2.example.com:8080:user:pass
|
||||||
http://user:pass@proxy-3.example.com:8080
|
proxy-3.example.com:8080:user:pass
|
||||||
```
|
```
|
||||||
|
|
||||||
Run a crawl with controlled concurrency:
|
Run a crawl with controlled concurrency:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue