mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
Compare commits
59 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a1abf625a0 | ||
|
|
9a63c1a3ca | ||
|
|
58d274ffe9 | ||
|
|
f6000cba52 | ||
|
|
217bfe088b | ||
|
|
3b7d11328e | ||
|
|
363e17d362 | ||
|
|
8fe8bcb479 | ||
|
|
51260ae4e3 | ||
|
|
fe567a6af1 | ||
|
|
be8bcfebd9 | ||
|
|
aab51bea91 | ||
|
|
b75b768ec3 | ||
|
|
3fabdc1d02 | ||
|
|
5eef8358b0 | ||
|
|
7dfd62ec1d | ||
|
|
6d886c44f6 | ||
|
|
8e3ad17428 | ||
|
|
7321549412 | ||
|
|
72edb61881 | ||
|
|
00d86a12bc | ||
|
|
c8be5214f6 | ||
|
|
0ea189c5b2 | ||
|
|
a629534490 | ||
|
|
fd2e75d509 | ||
|
|
e2f89941ac | ||
|
|
307b4f980d | ||
|
|
dbf9ce08a6 | ||
|
|
3bcb288d13 | ||
|
|
a611ae26f3 | ||
|
|
af96628dc9 | ||
|
|
e8ca1417d6 | ||
|
|
7f75143954 | ||
|
|
e6a95f783d | ||
|
|
a3aa4bce6f | ||
|
|
86183b11e4 | ||
|
|
513b0e493e | ||
|
|
a1242a1c1d | ||
|
|
a542e45768 | ||
|
|
615f326660 | ||
|
|
72b8dbc285 | ||
|
|
1c9def2fde | ||
|
|
eede2f6953 | ||
|
|
bdf81fe6bf | ||
|
|
23544f8fac | ||
|
|
923445f4a8 | ||
|
|
0e6c7cdc97 | ||
|
|
5795c5c422 | ||
|
|
4908367720 | ||
|
|
a5c3433372 | ||
|
|
966981bc42 | ||
|
|
866fa88aa0 | ||
|
|
b413d702b2 | ||
|
|
98a177dec4 | ||
|
|
e1af2da509 | ||
|
|
2285c585b1 | ||
|
|
b77767814a | ||
|
|
4bf11d902f | ||
|
|
0daa2fec1a |
71 changed files with 6443 additions and 925 deletions
13
.github/FUNDING.yml
vendored
Normal file
13
.github/FUNDING.yml
vendored
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
github: [0xMassi]
|
||||||
|
patreon:
|
||||||
|
open_collective:
|
||||||
|
ko_fi:
|
||||||
|
tidelift:
|
||||||
|
community_bridge:
|
||||||
|
liberapay:
|
||||||
|
issuehunt:
|
||||||
|
lfx_crowdfunding:
|
||||||
|
polar:
|
||||||
|
buy_me_a_coffee:
|
||||||
|
thanks_dev:
|
||||||
|
custom:
|
||||||
21
.github/workflows/ci.yml
vendored
21
.github/workflows/ci.yml
vendored
|
|
@ -14,7 +14,7 @@ jobs:
|
||||||
name: Test
|
name: Test
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v5
|
||||||
- uses: dtolnay/rust-toolchain@stable
|
- uses: dtolnay/rust-toolchain@stable
|
||||||
- uses: Swatinem/rust-cache@v2
|
- uses: Swatinem/rust-cache@v2
|
||||||
- run: cargo test --workspace
|
- run: cargo test --workspace
|
||||||
|
|
@ -23,7 +23,7 @@ jobs:
|
||||||
name: Lint
|
name: Lint
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v5
|
||||||
- uses: dtolnay/rust-toolchain@stable
|
- uses: dtolnay/rust-toolchain@stable
|
||||||
with:
|
with:
|
||||||
components: clippy, rustfmt
|
components: clippy, rustfmt
|
||||||
|
|
@ -31,11 +31,26 @@ jobs:
|
||||||
- run: cargo fmt --check --all
|
- run: cargo fmt --check --all
|
||||||
- run: cargo clippy --all -- -D warnings
|
- run: cargo clippy --all -- -D warnings
|
||||||
|
|
||||||
|
wasm:
|
||||||
|
name: WASM
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v5
|
||||||
|
- uses: dtolnay/rust-toolchain@stable
|
||||||
|
with:
|
||||||
|
targets: wasm32-unknown-unknown
|
||||||
|
- uses: Swatinem/rust-cache@v2
|
||||||
|
# webclaw-core must stay WASM-safe (zero network deps, no threads).
|
||||||
|
# Check both with and without default features so the quickjs gate
|
||||||
|
# can't regress.
|
||||||
|
- run: cargo check --target wasm32-unknown-unknown -p webclaw-core
|
||||||
|
- run: cargo check --target wasm32-unknown-unknown -p webclaw-core --no-default-features
|
||||||
|
|
||||||
docs:
|
docs:
|
||||||
name: Docs
|
name: Docs
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v5
|
||||||
- uses: dtolnay/rust-toolchain@stable
|
- uses: dtolnay/rust-toolchain@stable
|
||||||
- uses: Swatinem/rust-cache@v2
|
- uses: Swatinem/rust-cache@v2
|
||||||
- run: cargo doc --no-deps --workspace
|
- run: cargo doc --no-deps --workspace
|
||||||
|
|
|
||||||
2
.github/workflows/deps.yml
vendored
2
.github/workflows/deps.yml
vendored
|
|
@ -14,7 +14,7 @@ jobs:
|
||||||
name: Update webclaw-tls dependencies
|
name: Update webclaw-tls dependencies
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v5
|
||||||
with:
|
with:
|
||||||
token: ${{ secrets.SYNC_PAT }}
|
token: ${{ secrets.SYNC_PAT }}
|
||||||
|
|
||||||
|
|
|
||||||
73
.github/workflows/release.yml
vendored
73
.github/workflows/release.yml
vendored
|
|
@ -5,14 +5,15 @@ on:
|
||||||
tags: ["v*"]
|
tags: ["v*"]
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
contents: write
|
contents: read
|
||||||
packages: write
|
|
||||||
|
|
||||||
env:
|
env:
|
||||||
CARGO_TERM_COLOR: always
|
CARGO_TERM_COLOR: always
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build:
|
build:
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
name: Build ${{ matrix.target }}
|
name: Build ${{ matrix.target }}
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
strategy:
|
strategy:
|
||||||
|
|
@ -27,9 +28,11 @@ jobs:
|
||||||
os: ubuntu-latest
|
os: ubuntu-latest
|
||||||
- target: aarch64-unknown-linux-gnu
|
- target: aarch64-unknown-linux-gnu
|
||||||
os: ubuntu-latest
|
os: ubuntu-latest
|
||||||
|
- target: x86_64-pc-windows-msvc
|
||||||
|
os: windows-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v5
|
||||||
|
|
||||||
- uses: dtolnay/rust-toolchain@stable
|
- uses: dtolnay/rust-toolchain@stable
|
||||||
with:
|
with:
|
||||||
|
|
@ -57,6 +60,12 @@ jobs:
|
||||||
if: matrix.target != 'aarch64-unknown-linux-gnu' && runner.os == 'Linux'
|
if: matrix.target != 'aarch64-unknown-linux-gnu' && runner.os == 'Linux'
|
||||||
run: sudo apt-get update && sudo apt-get install -y cmake
|
run: sudo apt-get update && sudo apt-get install -y cmake
|
||||||
|
|
||||||
|
- name: Install NASM (Windows)
|
||||||
|
if: runner.os == 'Windows'
|
||||||
|
run: |
|
||||||
|
choco install nasm -y
|
||||||
|
echo "C:\Program Files\NASM" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
run: cargo build --release --target ${{ matrix.target }}
|
run: cargo build --release --target ${{ matrix.target }}
|
||||||
|
|
||||||
|
|
@ -71,15 +80,25 @@ jobs:
|
||||||
# don't repeat that mistake. If a future binary gets renamed or
|
# don't repeat that mistake. If a future binary gets renamed or
|
||||||
# removed, this step should scream, not quietly publish an
|
# removed, this step should scream, not quietly publish an
|
||||||
# incomplete release.
|
# incomplete release.
|
||||||
cp target/${{ matrix.target }}/release/webclaw "$staging/"
|
|
||||||
cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/"
|
if [[ "${{ matrix.os }}" == "windows-latest" ]]; then
|
||||||
cp target/${{ matrix.target }}/release/webclaw-server "$staging/"
|
cp target/${{ matrix.target }}/release/webclaw.exe "$staging/"
|
||||||
cp README.md LICENSE "$staging/"
|
cp target/${{ matrix.target }}/release/webclaw-mcp.exe "$staging/"
|
||||||
tar czf "$staging.tar.gz" "$staging"
|
cp target/${{ matrix.target }}/release/webclaw-server.exe "$staging/"
|
||||||
echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV
|
cp README.md LICENSE "$staging/"
|
||||||
|
7z a -tzip "$staging.zip" "$staging"
|
||||||
|
echo "ASSET=$staging.zip" >> $GITHUB_ENV
|
||||||
|
else
|
||||||
|
cp target/${{ matrix.target }}/release/webclaw "$staging/"
|
||||||
|
cp target/${{ matrix.target }}/release/webclaw-mcp "$staging/"
|
||||||
|
cp target/${{ matrix.target }}/release/webclaw-server "$staging/"
|
||||||
|
cp README.md LICENSE "$staging/"
|
||||||
|
tar czf "$staging.tar.gz" "$staging"
|
||||||
|
echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV
|
||||||
|
fi
|
||||||
|
|
||||||
- name: Upload artifact
|
- name: Upload artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v5
|
||||||
with:
|
with:
|
||||||
name: ${{ matrix.target }}
|
name: ${{ matrix.target }}
|
||||||
path: ${{ env.ASSET }}
|
path: ${{ env.ASSET }}
|
||||||
|
|
@ -88,10 +107,10 @@ jobs:
|
||||||
name: Release
|
name: Release
|
||||||
needs: build
|
needs: build
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
permissions:
|
||||||
|
contents: write
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/download-artifact@v5
|
||||||
|
|
||||||
- uses: actions/download-artifact@v4
|
|
||||||
with:
|
with:
|
||||||
path: artifacts
|
path: artifacts
|
||||||
|
|
||||||
|
|
@ -99,23 +118,31 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
cd artifacts
|
cd artifacts
|
||||||
find . -name '*.tar.gz' -exec mv {} . \;
|
find . -name '*.tar.gz' -exec mv {} . \;
|
||||||
sha256sum *.tar.gz > SHA256SUMS
|
find . -name '*.zip' -exec mv {} . \;
|
||||||
|
sha256sum *.tar.gz *.zip > SHA256SUMS 2>/dev/null || sha256sum * > SHA256SUMS
|
||||||
cat SHA256SUMS
|
cat SHA256SUMS
|
||||||
|
|
||||||
- name: Create GitHub Release
|
- name: Create GitHub Release
|
||||||
uses: softprops/action-gh-release@v2
|
env:
|
||||||
with:
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
generate_release_notes: true
|
run: |
|
||||||
files: |
|
tag="${GITHUB_REF#refs/tags/}"
|
||||||
artifacts/*.tar.gz
|
gh release create "$tag" \
|
||||||
artifacts/SHA256SUMS
|
artifacts/*.tar.gz \
|
||||||
|
artifacts/*.zip \
|
||||||
|
artifacts/SHA256SUMS \
|
||||||
|
--repo "$GITHUB_REPOSITORY" \
|
||||||
|
--generate-notes
|
||||||
|
|
||||||
docker:
|
docker:
|
||||||
name: Docker
|
name: Docker
|
||||||
needs: release
|
needs: release
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
packages: write
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v5
|
||||||
|
|
||||||
- uses: docker/setup-qemu-action@v3
|
- uses: docker/setup-qemu-action@v3
|
||||||
with:
|
with:
|
||||||
|
|
@ -173,6 +200,8 @@ jobs:
|
||||||
name: Update Homebrew
|
name: Update Homebrew
|
||||||
needs: [release, docker]
|
needs: [release, docker]
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
steps:
|
steps:
|
||||||
- name: Compute all checksums and update formula
|
- name: Compute all checksums and update formula
|
||||||
env:
|
env:
|
||||||
|
|
@ -181,7 +210,7 @@ jobs:
|
||||||
tag="${GITHUB_REF#refs/tags/}"
|
tag="${GITHUB_REF#refs/tags/}"
|
||||||
base="https://github.com/0xMassi/webclaw/releases/download/${tag}"
|
base="https://github.com/0xMassi/webclaw/releases/download/${tag}"
|
||||||
|
|
||||||
# Download all 4 tarballs and compute SHAs
|
# Download all tarballs (Linux + macOS) and compute SHAs
|
||||||
for target in aarch64-apple-darwin x86_64-apple-darwin aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu; do
|
for target in aarch64-apple-darwin x86_64-apple-darwin aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu; do
|
||||||
curl -sSL "${base}/webclaw-${tag}-${target}.tar.gz" -o "${target}.tar.gz"
|
curl -sSL "${base}/webclaw-${tag}-${target}.tar.gz" -o "${target}.tar.gz"
|
||||||
done
|
done
|
||||||
|
|
|
||||||
126
CHANGELOG.md
126
CHANGELOG.md
|
|
@ -3,6 +3,132 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [0.6.5] — 2026-06-04
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Reddit threads extract reliably again. The old anonymous JSON endpoint is no longer available, so webclaw now reads old.reddit.com directly without an API key or JavaScript. You get the post plus the full nested comment tree, with authors, scores, timestamps, and reply nesting preserved. Comment text keeps its links and code blocks, hidden scores are reported as unknown rather than zero, and deleted comments stay in place so their replies aren't lost.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.6.4] — 2026-05-19
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- API surface discovery: a new module extracts the API endpoints embedded in a page's inline scripts and linked JavaScript bundles. It surfaces relative REST paths, absolute URLs, GraphQL operations, and WebSocket endpoints that a sitemap alone cannot reveal. A built-in noise filter drops schema.org and json-schema.org references, bare framework paths, and other non-API matches so the result stays focused on the real surface.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.6.3] — 2026-05-19
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Hardened resource and path-safety limits across the CLI, MCP server, and self-hosted API: oversized or highly compressed responses are capped while streaming, deeply nested page data can no longer exhaust memory, output filenames stay inside the chosen directory, webhook URLs are validated like every other fetch, and multibyte search queries no longer crash slug generation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.6.2] — 2026-05-18
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Cleaned up `--format llm` output on noisy news and documentation pages. Comment-count links, bare page-number paragraphs, pagination leftovers such as `0 Next`, and duplicated JSON-LD article bodies are now removed before they reach the LLM context.
|
||||||
|
- The CLI now recognizes common cookie-consent redirects and prints a clearer warning when a page returns a consent wall instead of usable content.
|
||||||
|
- The CLI keeps noisy parser warnings from real-world malformed HTML out of stderr by default. `WEBCLAW_LOG` still lets advanced users opt into deeper parser logs.
|
||||||
|
|
||||||
|
Thanks to Nenad Oric (`@devnen`) for the report and patch work in PR #43.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.6.1] — 2026-05-12
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Hardened URL safety across the CLI, MCP server, and self-hosted API paths so local and private network targets are rejected more consistently, including after DNS resolution and redirects.
|
||||||
|
- Added a timeout around inline JavaScript data extraction so hostile pages cannot keep the extractor busy forever.
|
||||||
|
- Tightened Amazon and eBay URL recognition so deceptive hosts are rejected while common international marketplaces still work.
|
||||||
|
- Avoided unnecessary decoding work on large responses during bot-challenge detection.
|
||||||
|
- Reduced release workflow token permissions so build jobs run with narrower GitHub access.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.6.0] — 2026-05-10
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Improved `--format llm` output quality on modern news and documentation pages. Framework hydration blobs and low-value page chrome structured-data records are now filtered out before they can flood the LLM context, while content-bearing Schema.org records are preserved. Thanks and congrats to Nenad Oric (`@devnen`) for the contribution in PR #37.
|
||||||
|
- Fixed element-to-text spacing so adjacent inline nodes no longer smash words together, while punctuation stays attached on real pages such as docs, forums, and reference sites.
|
||||||
|
- Removed common screen-reader-only link chrome such as "opens new tab" from LLM body text and link labels without stripping ordinary prose that happens to mention external links.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.5.9] — 2026-05-06
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- LLM providers now support `ANTHROPIC_BASE_URL` for Anthropic-compatible proxies, plus an `OPENAI_RESPONSE_FORMAT_TYPE` override for OpenAI-compatible backends such as LM Studio. Thanks to Toti (`@Toti330`) for the report.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.5.8] — 2026-05-04
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- GitHub Releases now include a Windows x86_64 `.zip` with `webclaw.exe`, `webclaw-mcp.exe`, and `webclaw-server.exe`. Thanks to Suryansh Mishra (`@notrealsuryansh`) for the contribution.
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Improved brand extraction results for modern sites with large app shells. Brand colors, fonts, and logos are now less likely to be polluted by login widgets, customer-logo grids, icon fonts, or generated CSS noise.
|
||||||
|
|
||||||
|
### Docs
|
||||||
|
- Refreshed the README badges with a cleaner shieldcn style. Thanks to Justin Levine (`@jal-co`) for the contribution, and shout-out to his open-source [shieldcn](https://github.com/jal-co/shieldcn) project.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.5.7] — 2026-04-30
|
||||||
|
|
||||||
|
### Security
|
||||||
|
- Hardened server-side URL fetching against SSRF by rejecting private/internal IP ranges and unsafe redirect targets across CLI, MCP, and the self-hosted REST server. Thanks to KairoKid / dodge1218 (vonbrubeck@gmail.com) for the responsible report.
|
||||||
|
|
||||||
|
### Docs
|
||||||
|
- README header now uses an `<h1>webclaw</h1>` instead of an `<h3>` slogan. The repo had no heading-level brand anchor before, only a banner image, so search engines indexing the README were missing the canonical brand signal. The new heading is what GitHub renders as the title of the page and what Google co-ranks with webclaw.io.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.5.6] — 2026-04-23
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- `FetchClient::fetch_smart(url)` applies per-site rescue logic and returns the same `FetchResult` shape as `fetch()`. Reddit URLs route to the `.json` API with an identifiable bot `User-Agent`, and Akamai-style challenge pages trigger a homepage cookie warmup plus a retry. Makes `/v1/scrape` on Reddit populate markdown again.
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Regression introduced in 0.5.4 where the production server's `/v1/scrape` bypassed the Reddit `.json` shortcut and Akamai cookie warmup that `fetch_and_extract` had been providing. Both helpers now live in `fetch_smart` and every caller path picks them up.
|
||||||
|
- Panic in the markdown converter (`markdown.rs:925`) on single-pipe `|` lines. A `[1..len-1]` slice on a 1-char input triggered `begin <= end`. Guarded.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.5.5] — 2026-04-23
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- `webclaw --browser safari-ios` on the CLI. Pairs with `--proxy` for DataDome-fronted sites that reject desktop profiles.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.5.4] — 2026-04-23
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- New `BrowserProfile::SafariIos` for Safari iOS 26 fingerprinting. Pairs with a country-matched residential proxy for sites that reject non-mobile profiles.
|
||||||
|
- `accept_language_for_url(url)` and `accept_language_for_tld(tld)` helpers. Returns a locale-appropriate `Accept-Language` based on the URL's TLD, with `en-US` as the fallback.
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Chrome browser fingerprint refreshed for current Cloudflare bot management. Fixes 403 challenges on several e-commerce and jobs sites.
|
||||||
|
- Bumped `wreq-util` to `3.0.0-rc.10`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## [0.5.2] — 2026-04-22
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **`webclaw vertical <name> <url>` subcommand on the CLI.** Runs a specific vertical extractor and prints typed JSON (pretty-printed by default, `--raw` for single-line). Example: `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/` returns `{post: {title, author, points, ...}, comments: [...]}`. URL-mismatch errors surface cleanly as `"URL '...' does not match the '...' extractor"` on stderr with exit code 1.
|
||||||
|
|
||||||
|
- **`webclaw extractors` subcommand on the CLI.** Lists all 28 vertical extractors with name, label, and one URL pattern sample. `--json` emits the full catalog as JSON (same shape as `GET /v1/extractors`) for tooling. Covers discovery for users who don't know which vertical to pick.
|
||||||
|
|
||||||
|
- **`vertical_scrape` and `list_extractors` tools on `webclaw-mcp`.** Claude Desktop / Claude Code users can now call any of the 28 extractors by name from an MCP session. Tool count goes from 10 to 12. `list_extractors` takes no args and returns the full catalog; `vertical_scrape` takes `{name, url}` and returns the typed JSON payload. Antibot-gated verticals still auto-escalate to the webclaw cloud API when `WEBCLAW_API_KEY` is set.
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Server-info instruction string in `webclaw-mcp` now lists all 12 tools (previously hard-coded 10). Also `webclaw --help` on the CLI now shows the three subcommands: `bench`, `extractors`, `vertical`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## [0.5.1] — 2026-04-22
|
## [0.5.1] — 2026-04-22
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,7 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R
|
||||||
- `filter.rs` — CSS selector include/exclude filtering (ExtractionOptions)
|
- `filter.rs` — CSS selector include/exclude filtering (ExtractionOptions)
|
||||||
- `diff.rs` — Content change tracking engine (snapshot diffing)
|
- `diff.rs` — Content change tracking engine (snapshot diffing)
|
||||||
- `brand.rs` — Brand identity extraction from DOM structure and CSS
|
- `brand.rs` — Brand identity extraction from DOM structure and CSS
|
||||||
|
- `youtube.rs` — `ytInitialPlayerResponse` parser, structured markdown for `youtube.com/watch` URLs (title, channel, views, published, duration, description). Produces the legacy markdown shape — for transcripts and a structured `YoutubeData` block see the production server's `youtube_transcript.rs` short-circuit (yt-dlp via proxy pool).
|
||||||
|
|
||||||
### Fetch Modules (`webclaw-fetch`)
|
### Fetch Modules (`webclaw-fetch`)
|
||||||
- `client.rs` — FetchClient with wreq BoringSSL TLS impersonation; implements the public `Fetcher` trait so callers (including server adapters) can swap in alternative implementations
|
- `client.rs` — FetchClient with wreq BoringSSL TLS impersonation; implements the public `Fetcher` trait so callers (including server adapters) can swap in alternative implementations
|
||||||
|
|
@ -79,7 +80,7 @@ Three binaries: `webclaw` (CLI), `webclaw-mcp` (MCP server), `webclaw-server` (R
|
||||||
- **webclaw-fetch uses wreq 6.x** (BoringSSL). No `[patch.crates-io]` forks needed; wreq handles TLS internally.
|
- **webclaw-fetch uses wreq 6.x** (BoringSSL). No `[patch.crates-io]` forks needed; wreq handles TLS internally.
|
||||||
- **No special RUSTFLAGS** — `.cargo/config.toml` is currently empty of build flags. Don't add any.
|
- **No special RUSTFLAGS** — `.cargo/config.toml` is currently empty of build flags. Don't add any.
|
||||||
- **webclaw-llm uses plain reqwest**. LLM APIs don't need TLS fingerprinting, so no wreq dep.
|
- **webclaw-llm uses plain reqwest**. LLM APIs don't need TLS fingerprinting, so no wreq dep.
|
||||||
- **Vertical extractors take `&dyn Fetcher`**, not `&FetchClient`. This lets the production server plug in a `TlsSidecarFetcher` that routes through the Go tls-sidecar instead of in-process wreq.
|
- **Vertical extractors take `&dyn Fetcher`**, not `&FetchClient`. This lets the production server plug in a `ProductionFetcher` that adds domain_hints routing and antibot escalation on top of the same wreq client.
|
||||||
- **qwen3 thinking tags** (`<think>`) are stripped at both provider and consumer levels.
|
- **qwen3 thinking tags** (`<think>`) are stripped at both provider and consumer levels.
|
||||||
|
|
||||||
## Build & Test
|
## Build & Test
|
||||||
|
|
|
||||||
|
|
@ -91,18 +91,16 @@ Body is optional but encouraged for non-trivial changes.
|
||||||
|
|
||||||
```
|
```
|
||||||
webclaw (this repo)
|
webclaw (this repo)
|
||||||
├── crates/
|
└── crates/
|
||||||
│ ├── webclaw-core/ # Pure extraction engine (HTML → markdown/json/text)
|
├── webclaw-core/ # Pure extraction engine (HTML → markdown/json/text)
|
||||||
│ ├── webclaw-fetch/ # HTTP client + crawler + sitemap + batch
|
├── webclaw-fetch/ # HTTP client (wreq/BoringSSL) + crawler + sitemap + batch
|
||||||
│ ├── webclaw-llm/ # LLM provider chain (Ollama → OpenAI → Anthropic)
|
├── webclaw-llm/ # LLM provider chain (Ollama → OpenAI → Anthropic)
|
||||||
│ ├── webclaw-pdf/ # PDF text extraction
|
├── webclaw-pdf/ # PDF text extraction
|
||||||
│ ├── webclaw-cli/ # CLI binary
|
├── webclaw-cli/ # CLI binary
|
||||||
│ └── webclaw-mcp/ # MCP server binary
|
└── webclaw-mcp/ # MCP server binary
|
||||||
│
|
|
||||||
└── [patch.crates-io] # Points to webclaw-tls for TLS fingerprinting
|
|
||||||
```
|
```
|
||||||
|
|
||||||
TLS fingerprinting lives in a separate repo: [webclaw-tls](https://github.com/0xMassi/webclaw-tls). The `[patch.crates-io]` section in `Cargo.toml` overrides rustls, h2, hyper, hyper-util, and reqwest with our patched forks for browser-grade JA4 + HTTP/2 Akamai fingerprinting.
|
TLS fingerprinting is handled in-process by [wreq](https://crates.io/crates/wreq) (BoringSSL), so `webclaw-fetch` impersonates real browser TLS directly. There are no `[patch.crates-io]` forks or external TLS dependencies.
|
||||||
|
|
||||||
## Crate Boundaries
|
## Crate Boundaries
|
||||||
|
|
||||||
|
|
@ -111,7 +109,7 @@ Changes that cross crate boundaries need extra care:
|
||||||
| Crate | Network? | Key constraint |
|
| Crate | Network? | Key constraint |
|
||||||
|-------|----------|----------------|
|
|-------|----------|----------------|
|
||||||
| webclaw-core | No | Zero network deps, WASM-safe |
|
| webclaw-core | No | Zero network deps, WASM-safe |
|
||||||
| webclaw-fetch | Yes (webclaw-http) | Uses [webclaw-tls](https://github.com/0xMassi/webclaw-tls) for TLS fingerprinting |
|
| webclaw-fetch | Yes (wreq) | Browser TLS impersonation via wreq (BoringSSL); no patched deps |
|
||||||
| webclaw-llm | Yes (reqwest) | Plain reqwest — LLM APIs don't need TLS fingerprinting |
|
| webclaw-llm | Yes (reqwest) | Plain reqwest — LLM APIs don't need TLS fingerprinting |
|
||||||
| webclaw-pdf | No | Minimal, wraps pdf-extract |
|
| webclaw-pdf | No | Minimal, wraps pdf-extract |
|
||||||
| webclaw-cli | Yes | Depends on all above |
|
| webclaw-cli | Yes | Depends on all above |
|
||||||
|
|
|
||||||
45
Cargo.lock
generated
45
Cargo.lock
generated
|
|
@ -2967,6 +2967,26 @@ dependencies = [
|
||||||
"pom",
|
"pom",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "typed-builder"
|
||||||
|
version = "0.23.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "31aa81521b70f94402501d848ccc0ecaa8f93c8eb6999eb9747e72287757ffda"
|
||||||
|
dependencies = [
|
||||||
|
"typed-builder-macro",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "typed-builder-macro"
|
||||||
|
version = "0.23.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "076a02dc54dd46795c2e9c8282ed40bcfb1e22747e955de9389a1de28190fb26"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "typed-path"
|
name = "typed-path"
|
||||||
version = "0.12.3"
|
version = "0.12.3"
|
||||||
|
|
@ -3199,7 +3219,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.5.1"
|
version = "0.6.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3220,7 +3240,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.5.1"
|
version = "0.6.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -3238,7 +3258,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.5.1"
|
version = "0.6.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes",
|
||||||
|
|
@ -3258,12 +3278,13 @@ dependencies = [
|
||||||
"webclaw-core",
|
"webclaw-core",
|
||||||
"webclaw-pdf",
|
"webclaw-pdf",
|
||||||
"wreq",
|
"wreq",
|
||||||
|
"wreq-util",
|
||||||
"zip 2.4.2",
|
"zip 2.4.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.5.1"
|
version = "0.6.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3276,7 +3297,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.5.1"
|
version = "0.6.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dirs",
|
"dirs",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3296,7 +3317,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.5.1"
|
version = "0.6.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
@ -3305,7 +3326,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-server"
|
name = "webclaw-server"
|
||||||
version = "0.5.1"
|
version = "0.6.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"axum",
|
"axum",
|
||||||
|
|
@ -3709,6 +3730,16 @@ dependencies = [
|
||||||
"zstd",
|
"zstd",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wreq-util"
|
||||||
|
version = "3.0.0-rc.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6c6bbe24d28beb9ceb58b514bd6a613c759d3b706f768b9d2950d5d35b543c04"
|
||||||
|
dependencies = [
|
||||||
|
"typed-builder",
|
||||||
|
"wreq",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "writeable"
|
name = "writeable"
|
||||||
version = "0.6.2"
|
version = "0.6.2"
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
||||||
members = ["crates/*"]
|
members = ["crates/*"]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "0.5.1"
|
version = "0.6.5"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
license = "AGPL-3.0"
|
license = "AGPL-3.0"
|
||||||
repository = "https://github.com/0xMassi/webclaw"
|
repository = "https://github.com/0xMassi/webclaw"
|
||||||
|
|
@ -21,4 +21,3 @@ tracing = "0.1"
|
||||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||||
clap = { version = "4", features = ["derive", "env"] }
|
clap = { version = "4", features = ["derive", "env"] }
|
||||||
dotenvy = "0.15"
|
dotenvy = "0.15"
|
||||||
|
|
||||||
|
|
|
||||||
14
Dockerfile
14
Dockerfile
|
|
@ -59,9 +59,9 @@ RUN touch crates/*/src/*.rs \
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
FROM ubuntu:24.04
|
FROM ubuntu:24.04
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
# CA bundle from distroless (ships it, multi-arch, gcr.io) instead of
|
||||||
ca-certificates \
|
# apt-installing from ports.ubuntu.com (unreachable for arm64 on CI runners).
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
||||||
|
|
||||||
# Copy all three binaries
|
# Copy all three binaries
|
||||||
COPY --from=builder /build/target/release/webclaw /usr/local/bin/webclaw
|
COPY --from=builder /build/target/release/webclaw /usr/local/bin/webclaw
|
||||||
|
|
@ -73,11 +73,9 @@ COPY --from=builder /build/target/release/webclaw-server /usr/local/bin/webclaw-
|
||||||
# as documentation; callers still need `-p 3000:3000` on `docker run`.
|
# as documentation; callers still need `-p 3000:3000` on `docker run`.
|
||||||
EXPOSE 3000
|
EXPOSE 3000
|
||||||
|
|
||||||
# Container default: bind all interfaces so `-p 3000:3000` works. The binary
|
# Container default: bind all interfaces so `-p 3000:3000` works. Public
|
||||||
# itself defaults to 127.0.0.1 (safe for `cargo run` on a laptop); inside
|
# binding requires WEBCLAW_API_KEY; the binary refuses open-auth 0.0.0.0
|
||||||
# Docker that would make the server unreachable, so we flip it here.
|
# unless WEBCLAW_ALLOW_OPEN_PUBLIC=1 is set explicitly for local testing.
|
||||||
# Override with -e WEBCLAW_HOST=127.0.0.1 if you front this with another
|
|
||||||
# process in the same container.
|
|
||||||
ENV WEBCLAW_HOST=0.0.0.0
|
ENV WEBCLAW_HOST=0.0.0.0
|
||||||
|
|
||||||
# Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other
|
# Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other
|
||||||
|
|
|
||||||
|
|
@ -5,9 +5,10 @@ ARG BINARY_DIR=binaries
|
||||||
|
|
||||||
FROM ubuntu:24.04
|
FROM ubuntu:24.04
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
# CA bundle copied from a reliable multi-arch image instead of apt-installing
|
||||||
ca-certificates \
|
# from ports.ubuntu.com — Canonical's arm64 ports mirror is unreachable from
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
# CI runners and breaks the multi-arch release build. No build-time network.
|
||||||
|
COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
|
||||||
|
|
||||||
ARG BINARY_DIR
|
ARG BINARY_DIR
|
||||||
COPY ${BINARY_DIR}/webclaw /usr/local/bin/webclaw
|
COPY ${BINARY_DIR}/webclaw /usr/local/bin/webclaw
|
||||||
|
|
|
||||||
682
README.md
682
README.md
|
|
@ -1,70 +1,74 @@
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<a href="https://webclaw.io">
|
<a href="https://webclaw.io">
|
||||||
<img src=".github/banner.png" alt="webclaw" width="700" />
|
<img src=".github/banner.png" alt="webclaw" width="760" />
|
||||||
</a>
|
</a>
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<h3 align="center">
|
<h1 align="center">webclaw</h1>
|
||||||
The fastest web scraper for AI agents.<br/>
|
|
||||||
<sub>67% fewer tokens. Sub-millisecond extraction. Zero browser overhead.</sub>
|
|
||||||
</h3>
|
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<a href="https://github.com/0xMassi/webclaw/stargazers"><img src="https://img.shields.io/github/stars/0xMassi/webclaw?style=for-the-badge&logo=github&logoColor=white&label=Stars&color=181717" alt="Stars" /></a>
|
<strong>Turn websites into clean markdown, JSON, and LLM-ready context.</strong><br/>
|
||||||
<a href="https://github.com/0xMassi/webclaw/releases"><img src="https://img.shields.io/github/v/release/0xMassi/webclaw?style=for-the-badge&logo=rust&logoColor=white&label=Version&color=B7410E" alt="Version" /></a>
|
<sub>CLI, MCP server, REST API, and SDKs for AI agents and RAG pipelines.</sub>
|
||||||
<a href="https://github.com/0xMassi/webclaw/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-AGPL--3.0-10B981?style=for-the-badge" alt="License" /></a>
|
|
||||||
<a href="https://www.npmjs.com/package/create-webclaw"><img src="https://img.shields.io/npm/dt/create-webclaw?style=for-the-badge&logo=npm&logoColor=white&label=Installs&color=CB3837" alt="npm installs" /></a>
|
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<a href="https://discord.gg/KDfd48EpnW"><img src="https://img.shields.io/badge/Discord-Join-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Discord" /></a>
|
<a href="https://github.com/0xMassi/webclaw/stargazers"><img src="https://shieldcn.dev/github/stars/0xMassi/webclaw.svg?variant=branded&logo=github" alt="Stars" /></a>
|
||||||
<a href="https://x.com/webclaw_io"><img src="https://img.shields.io/badge/Follow-@webclaw__io-000000?style=for-the-badge&logo=x&logoColor=white" alt="X / Twitter" /></a>
|
<a href="https://github.com/0xMassi/webclaw/releases"><img src="https://shieldcn.dev/github/tag/0xMassi/webclaw.svg?variant=branded&logo=rust" alt="Version" /></a>
|
||||||
<a href="https://webclaw.io"><img src="https://img.shields.io/badge/Website-webclaw.io-0A0A0A?style=for-the-badge&logo=safari&logoColor=white" alt="Website" /></a>
|
<a href="https://github.com/0xMassi/webclaw/blob/main/LICENSE"><img src="https://shieldcn.dev/github/license/0xMassi/webclaw.svg?variant=branded" alt="License" /></a>
|
||||||
<a href="https://webclaw.io/docs"><img src="https://img.shields.io/badge/Docs-Read-3B82F6?style=for-the-badge&logo=readthedocs&logoColor=white" alt="Docs" /></a>
|
<a href="https://www.npmjs.com/package/create-webclaw"><img src="https://shieldcn.dev/npm/dt/create-webclaw.svg?variant=branded" alt="npm installs" /></a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<a href="https://discord.gg/KDfd48EpnW"><img src="https://shieldcn.dev/badge/Discord-Join.svg?variant=branded&logo=discord" alt="Discord" /></a>
|
||||||
|
<a href="https://x.com/webclaw_io"><img src="https://shieldcn.dev/badge/Follow-@webclaw__io.svg?variant=branded&logo=x" alt="X / Twitter" /></a>
|
||||||
|
<a href="https://webclaw.io"><img src="https://shieldcn.dev/badge/Hosted-webclaw.io.svg?variant=branded&logo=safari" alt="Hosted webclaw" /></a>
|
||||||
|
<a href="https://webclaw.io/docs"><img src="https://shieldcn.dev/badge/Docs-Read.svg?variant=branded&logo=readthedocs" alt="Docs" /></a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<img src="assets/demo.gif" alt="webclaw extracting clean markdown from a page" width="760" />
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
<p align="center">
|
Most web scraping tools give your agent one of two bad outputs:
|
||||||
<img src="assets/demo.gif" alt="Claude Code: web_fetch gets 403, webclaw extracts successfully" width="700" />
|
|
||||||
<br/>
|
|
||||||
<sub>Claude Code's built-in web_fetch → 403 Forbidden. webclaw → clean markdown.</sub>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
---
|
- a blocked page, login wall, or empty app shell
|
||||||
|
- raw HTML full of nav, scripts, styling, ads, and duplicated boilerplate
|
||||||
|
|
||||||
Your AI agent calls `fetch()` and gets a 403. Or 142KB of raw HTML that burns through your token budget. **webclaw fixes both.**
|
[webclaw.io](https://webclaw.io) is the hosted web extraction API for webclaw. This repo contains the open-source CLI, MCP server, extraction engine, and self-hostable server.
|
||||||
|
|
||||||
It extracts clean, structured content from any URL using Chrome-level TLS fingerprinting — no headless browser, no Selenium, no Puppeteer. Output is optimized for LLMs: **67% fewer tokens** than raw HTML, with metadata, links, and images preserved.
|
webclaw turns a URL into clean content your tools can actually use.
|
||||||
|
|
||||||
```
|
```bash
|
||||||
Raw HTML webclaw
|
webclaw https://example.com --format markdown
|
||||||
┌──────────────────────────────────┐ ┌──────────────────────────────────┐
|
|
||||||
│ <div class="ad-wrapper"> │ │ # Breaking: AI Breakthrough │
|
|
||||||
│ <nav class="global-nav"> │ │ │
|
|
||||||
│ <script>window.__NEXT_DATA__ │ │ Researchers achieved 94% │
|
|
||||||
│ ={...8KB of JSON...}</script> │ │ accuracy on cross-domain │
|
|
||||||
│ <div class="social-share"> │ │ reasoning benchmarks. │
|
|
||||||
│ <button>Tweet</button> │ │ │
|
|
||||||
│ <footer class="site-footer"> │ │ ## Key Findings │
|
|
||||||
│ <!-- 142,847 characters --> │ │ - 3x faster inference │
|
|
||||||
│ │ │ - Open-source weights │
|
|
||||||
│ 4,820 tokens │ │ 1,590 tokens │
|
|
||||||
└──────────────────────────────────┘ └──────────────────────────────────┘
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```md
|
||||||
|
# Example Domain
|
||||||
|
|
||||||
|
This domain is for use in illustrative examples in documents.
|
||||||
|
|
||||||
|
You may use this domain in literature without prior coordination or asking for permission.
|
||||||
|
```
|
||||||
|
|
||||||
|
Use it from the terminal, wire it into Claude/Cursor through MCP, call the hosted API from your app, or self-host the OSS server.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Get Started (30 seconds)
|
## Install
|
||||||
|
|
||||||
### For AI agents (Claude, Cursor, Windsurf, VS Code)
|
### Agent setup
|
||||||
|
|
||||||
|
The fastest way to connect webclaw to Claude Code, Claude Desktop, Cursor, Windsurf, OpenCode, Codex CLI, and other MCP-compatible tools:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
npx create-webclaw
|
npx create-webclaw
|
||||||
```
|
```
|
||||||
|
|
||||||
Auto-detects your AI tools, downloads the MCP server, and configures everything. One command.
|
The installer detects supported clients and configures the MCP server for you.
|
||||||
|
|
||||||
### Homebrew (macOS/Linux)
|
### Homebrew
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
brew tap 0xMassi/webclaw
|
brew tap 0xMassi/webclaw
|
||||||
|
|
@ -73,14 +77,7 @@ brew install webclaw
|
||||||
|
|
||||||
### Prebuilt binaries
|
### Prebuilt binaries
|
||||||
|
|
||||||
Download from [GitHub Releases](https://github.com/0xMassi/webclaw/releases) for macOS (arm64, x86_64) and Linux (x86_64, aarch64).
|
Download macOS and Linux binaries from [GitHub Releases](https://github.com/0xMassi/webclaw/releases).
|
||||||
|
|
||||||
### Cargo (from source)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cargo install --git https://github.com/0xMassi/webclaw.git webclaw-cli
|
|
||||||
cargo install --git https://github.com/0xMassi/webclaw.git webclaw-mcp
|
|
||||||
```
|
|
||||||
|
|
||||||
### Docker
|
### Docker
|
||||||
|
|
||||||
|
|
@ -88,89 +85,90 @@ cargo install --git https://github.com/0xMassi/webclaw.git webclaw-mcp
|
||||||
docker run --rm ghcr.io/0xmassi/webclaw https://example.com
|
docker run --rm ghcr.io/0xmassi/webclaw https://example.com
|
||||||
```
|
```
|
||||||
|
|
||||||
### Docker Compose (with Ollama for LLM features)
|
### Cargo
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cp env.example .env
|
cargo install --git https://github.com/0xMassi/webclaw.git webclaw-cli
|
||||||
docker compose up -d
|
cargo install --git https://github.com/0xMassi/webclaw.git webclaw-mcp
|
||||||
|
```
|
||||||
|
|
||||||
|
If building from source fails because native build tools are missing, install the platform prerequisites:
|
||||||
|
|
||||||
|
| OS | Command |
|
||||||
|
| --- | --- |
|
||||||
|
| Debian / Ubuntu | `sudo apt install -y pkg-config libssl-dev cmake clang git build-essential` |
|
||||||
|
| Fedora / RHEL | `sudo dnf install -y pkg-config openssl-devel cmake clang git make gcc` |
|
||||||
|
| Arch | `sudo pacman -S pkg-config openssl cmake clang git base-devel` |
|
||||||
|
| macOS | `xcode-select --install` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### Scrape one page
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://stripe.com --format markdown
|
||||||
|
```
|
||||||
|
|
||||||
|
### Return LLM-optimized text
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://docs.anthropic.com --format llm
|
||||||
|
```
|
||||||
|
|
||||||
|
### Keep only the main content
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://example.com/blog/post --only-main-content
|
||||||
|
```
|
||||||
|
|
||||||
|
### Include or exclude selectors
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://example.com \
|
||||||
|
--include "article, main, .content" \
|
||||||
|
--exclude "nav, footer, .sidebar, .ad"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Crawl a documentation site
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://docs.rust-lang.org --crawl --depth 2 --max-pages 50
|
||||||
|
```
|
||||||
|
|
||||||
|
### Workflow examples
|
||||||
|
|
||||||
|
- [HTML to Markdown for RAG](examples/html-to-markdown-rag/)
|
||||||
|
- [Firecrawl-compatible API](examples/firecrawl-compatible-api/)
|
||||||
|
- [MCP web scraping](examples/mcp-web-scraping/)
|
||||||
|
- [Proxy-backed crawling](examples/proxy-backed-crawling/)
|
||||||
|
- [Cloudflare diagnostics](examples/cloudflare-diagnostics/)
|
||||||
|
|
||||||
|
### Extract brand assets
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://github.com --brand
|
||||||
|
```
|
||||||
|
|
||||||
|
### Compare a page over time
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://example.com/pricing --format json > pricing-old.json
|
||||||
|
webclaw https://example.com/pricing --diff-with pricing-old.json
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Why webclaw?
|
## MCP Server
|
||||||
|
|
||||||
| | webclaw | Firecrawl | Trafilatura | Readability |
|
webclaw ships with an MCP server for AI agents.
|
||||||
|---|:---:|:---:|:---:|:---:|
|
|
||||||
| **Extraction accuracy** | **95.1%** | — | 80.6% | 83.5% |
|
|
||||||
| **Token efficiency** | **-67%** | — | -55% | -51% |
|
|
||||||
| **Speed (100KB page)** | **3.2ms** | ~500ms | 18.4ms | 8.7ms |
|
|
||||||
| **TLS fingerprinting** | Yes | No | No | No |
|
|
||||||
| **Self-hosted** | Yes | No | Yes | Yes |
|
|
||||||
| **MCP (Claude/Cursor)** | Yes | No | No | No |
|
|
||||||
| **No browser required** | Yes | No | Yes | Yes |
|
|
||||||
| **Cost** | Free | $$$$ | Free | Free |
|
|
||||||
|
|
||||||
**Choose webclaw if** you want fast local extraction, LLM-optimized output, and native AI agent integration.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## What it looks like
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ webclaw https://stripe.com -f llm
|
npx create-webclaw
|
||||||
|
|
||||||
> URL: https://stripe.com
|
|
||||||
> Title: Stripe | Financial Infrastructure for the Internet
|
|
||||||
> Language: en
|
|
||||||
> Word count: 847
|
|
||||||
|
|
||||||
# Stripe | Financial Infrastructure for the Internet
|
|
||||||
|
|
||||||
Stripe is a suite of APIs powering online payment processing
|
|
||||||
and commerce solutions for internet businesses of all sizes.
|
|
||||||
|
|
||||||
## Products
|
|
||||||
- Payments — Accept payments online and in person
|
|
||||||
- Billing — Manage subscriptions and invoicing
|
|
||||||
- Connect — Build a marketplace or platform
|
|
||||||
...
|
|
||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
Manual config:
|
||||||
$ webclaw https://github.com --brand
|
|
||||||
|
|
||||||
{
|
|
||||||
"name": "GitHub",
|
|
||||||
"colors": [{"hex": "#59636E", "usage": "Primary"}, ...],
|
|
||||||
"fonts": ["Mona Sans", "ui-monospace"],
|
|
||||||
"logos": [{"url": "https://github.githubassets.com/...", "kind": "svg"}]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
$ webclaw https://docs.rust-lang.org --crawl --depth 2 --max-pages 50
|
|
||||||
|
|
||||||
Crawling... 50/50 pages extracted
|
|
||||||
---
|
|
||||||
# Page 1: https://docs.rust-lang.org/
|
|
||||||
...
|
|
||||||
# Page 2: https://docs.rust-lang.org/book/
|
|
||||||
...
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## MCP Server — 10 tools for AI agents
|
|
||||||
|
|
||||||
<a href="https://glama.ai/mcp/servers/0xMassi/webclaw"><img src="https://glama.ai/mcp/servers/0xMassi/webclaw/badge" alt="webclaw MCP server" /></a>
|
|
||||||
|
|
||||||
webclaw ships as an MCP server that plugs into Claude Desktop, Claude Code, Cursor, Windsurf, OpenCode, Antigravity, Codex CLI, and any MCP-compatible client.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
npx create-webclaw # auto-detects and configures everything
|
|
||||||
```
|
|
||||||
|
|
||||||
Or manual setup — add to your Claude Desktop config:
|
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
|
|
@ -182,218 +180,312 @@ Or manual setup — add to your Claude Desktop config:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Then in Claude: *"Scrape the top 5 results for 'web scraping tools' and compare their pricing"* — it just works.
|
Then ask your agent things like:
|
||||||
|
|
||||||
### Available tools
|
```text
|
||||||
|
Scrape these competitor pricing pages and summarize the differences.
|
||||||
| Tool | Description | Requires API key? |
|
|
||||||
|------|-------------|:-:|
|
|
||||||
| `scrape` | Extract content from any URL | No |
|
|
||||||
| `crawl` | Recursive site crawl | No |
|
|
||||||
| `map` | Discover URLs from sitemaps | No |
|
|
||||||
| `batch` | Parallel multi-URL extraction | No |
|
|
||||||
| `extract` | LLM-powered structured extraction | No (needs Ollama) |
|
|
||||||
| `summarize` | Page summarization | No (needs Ollama) |
|
|
||||||
| `diff` | Content change detection | No |
|
|
||||||
| `brand` | Brand identity extraction | No |
|
|
||||||
| `search` | Web search + scrape results | Yes |
|
|
||||||
| `research` | Deep multi-source research | Yes |
|
|
||||||
|
|
||||||
8 of 10 tools work locally — no account, no API key, fully private.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Features
|
|
||||||
|
|
||||||
### Extraction
|
|
||||||
|
|
||||||
- **Readability scoring** — multi-signal content detection (text density, semantic tags, link ratio)
|
|
||||||
- **Noise filtering** — strips nav, footer, ads, modals, cookie banners (Tailwind-safe)
|
|
||||||
- **Data island extraction** — catches React/Next.js JSON payloads, JSON-LD, hydration data
|
|
||||||
- **YouTube metadata** — structured data from any YouTube video
|
|
||||||
- **PDF extraction** — auto-detected via Content-Type
|
|
||||||
- **5 output formats** — markdown, text, JSON, LLM-optimized, HTML
|
|
||||||
|
|
||||||
### Content control
|
|
||||||
|
|
||||||
```bash
|
|
||||||
webclaw URL --include "article, .content" # CSS selector include
|
|
||||||
webclaw URL --exclude "nav, footer, .sidebar" # CSS selector exclude
|
|
||||||
webclaw URL --only-main-content # Auto-detect main content
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Crawling
|
```text
|
||||||
|
Crawl this documentation site and prepare clean context for a RAG index.
|
||||||
```bash
|
|
||||||
webclaw URL --crawl --depth 3 --max-pages 100 # BFS same-origin crawl
|
|
||||||
webclaw URL --crawl --sitemap # Seed from sitemap
|
|
||||||
webclaw URL --map # Discover URLs only
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### LLM features (Ollama / OpenAI / Anthropic)
|
```text
|
||||||
|
Extract the brand colors, fonts, and logos from this company website.
|
||||||
```bash
|
|
||||||
webclaw URL --summarize # Page summary
|
|
||||||
webclaw URL --extract-prompt "Get all prices" # Natural language extraction
|
|
||||||
webclaw URL --extract-json '{"type":"object"}' # Schema-enforced extraction
|
|
||||||
```
|
|
||||||
|
|
||||||
### Change tracking
|
|
||||||
|
|
||||||
```bash
|
|
||||||
webclaw URL -f json > snap.json # Take snapshot
|
|
||||||
webclaw URL --diff-with snap.json # Compare later
|
|
||||||
```
|
|
||||||
|
|
||||||
### Brand extraction
|
|
||||||
|
|
||||||
```bash
|
|
||||||
webclaw URL --brand # Colors, fonts, logos, OG image
|
|
||||||
```
|
|
||||||
|
|
||||||
### Proxy rotation
|
|
||||||
|
|
||||||
```bash
|
|
||||||
webclaw URL --proxy http://user:pass@host:port # Single proxy
|
|
||||||
webclaw URLs --proxy-file proxies.txt # Pool rotation
|
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Benchmarks
|
## Tools
|
||||||
|
|
||||||
All numbers from real tests on 50 diverse pages. See [benchmarks/](benchmarks/) for methodology and reproduction instructions.
|
| Tool | What it does | Local |
|
||||||
|
| --- | --- | :-: |
|
||||||
### Extraction quality
|
| `scrape` | Extract one URL as markdown, text, JSON, LLM format, or HTML | Yes |
|
||||||
|
| `crawl` | Follow same-origin links and extract discovered pages | Yes |
|
||||||
```
|
| `map` | Discover URLs without extracting every page | Yes |
|
||||||
Accuracy webclaw ███████████████████ 95.1%
|
| `batch` | Scrape multiple URLs in parallel | Yes |
|
||||||
readability ████████████████▋ 83.5%
|
| `extract` | Convert page content into structured data | Yes, with local or configured LLM |
|
||||||
trafilatura ████████████████ 80.6%
|
| `summarize` | Summarize a page | Yes, with local or configured LLM |
|
||||||
newspaper3k █████████████▎ 66.4%
|
| `diff` | Compare page content snapshots | Yes |
|
||||||
|
| `brand` | Extract colors, fonts, logos, and metadata | Yes |
|
||||||
Noise removal webclaw ███████████████████ 96.1%
|
| `search` | Search the web and scrape results | Hosted API |
|
||||||
readability █████████████████▊ 89.4%
|
| `research` | Multi-source research workflow | Hosted API |
|
||||||
trafilatura ██████████████████▏ 91.2%
|
|
||||||
newspaper3k ███████████████▎ 76.8%
|
|
||||||
```
|
|
||||||
|
|
||||||
### Speed (pure extraction, no network)
|
|
||||||
|
|
||||||
```
|
|
||||||
10KB page webclaw ██ 0.8ms
|
|
||||||
readability █████ 2.1ms
|
|
||||||
trafilatura ██████████ 4.3ms
|
|
||||||
|
|
||||||
100KB page webclaw ██ 3.2ms
|
|
||||||
readability █████ 8.7ms
|
|
||||||
trafilatura ██████████ 18.4ms
|
|
||||||
```
|
|
||||||
|
|
||||||
### Token efficiency (feeding to Claude/GPT)
|
|
||||||
|
|
||||||
| Format | Tokens | vs Raw HTML |
|
|
||||||
|--------|:------:|:-----------:|
|
|
||||||
| Raw HTML | 4,820 | baseline |
|
|
||||||
| readability | 2,340 | -51% |
|
|
||||||
| trafilatura | 2,180 | -55% |
|
|
||||||
| **webclaw llm** | **1,590** | **-67%** |
|
|
||||||
|
|
||||||
### Crawl speed
|
|
||||||
|
|
||||||
| Concurrency | webclaw | Crawl4AI | Scrapy |
|
|
||||||
|:-----------:|:-------:|:--------:|:------:|
|
|
||||||
| 5 | **9.8 pg/s** | 5.2 pg/s | 7.1 pg/s |
|
|
||||||
| 10 | **18.4 pg/s** | 8.7 pg/s | 12.3 pg/s |
|
|
||||||
| 20 | **32.1 pg/s** | 14.2 pg/s | 21.8 pg/s |
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## SDKs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm install @webclaw/sdk
|
||||||
|
pip install webclaw
|
||||||
|
go get github.com/0xMassi/webclaw-go
|
||||||
|
```
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>TypeScript</summary>
|
||||||
|
|
||||||
|
```ts
|
||||||
|
import { Webclaw } from "@webclaw/sdk";
|
||||||
|
|
||||||
|
const client = new Webclaw({ apiKey: process.env.WEBCLAW_API_KEY! });
|
||||||
|
|
||||||
|
const page = await client.scrape({
|
||||||
|
url: "https://example.com",
|
||||||
|
formats: ["markdown"],
|
||||||
|
only_main_content: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(page.markdown);
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Python</summary>
|
||||||
|
|
||||||
|
```python
|
||||||
|
from webclaw import Webclaw
|
||||||
|
|
||||||
|
client = Webclaw(api_key="wc_your_key")
|
||||||
|
|
||||||
|
page = client.scrape(
|
||||||
|
"https://example.com",
|
||||||
|
formats=["markdown"],
|
||||||
|
only_main_content=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(page.markdown)
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>cURL</summary>
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST https://api.webclaw.io/v1/scrape \
|
||||||
|
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"url": "https://example.com",
|
||||||
|
"formats": ["markdown"],
|
||||||
|
"only_main_content": true
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Output Formats
|
||||||
|
|
||||||
|
| Format | Use it when you need |
|
||||||
|
| --- | --- |
|
||||||
|
| `markdown` | Clean page content with structure preserved |
|
||||||
|
| `llm` | Compact context for agents and RAG pipelines |
|
||||||
|
| `text` | Plain text with minimal formatting |
|
||||||
|
| `json` | Structured metadata, links, images, and extracted fields |
|
||||||
|
| `html` | Cleaned HTML for custom processing |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Local First, Hosted When Needed
|
||||||
|
|
||||||
|
The CLI and MCP server work locally without an account for the core extraction path.
|
||||||
|
|
||||||
|
Use the hosted API at [webclaw.io](https://webclaw.io) when you need:
|
||||||
|
|
||||||
|
- protected-site access without managing infrastructure
|
||||||
|
- JavaScript rendering
|
||||||
|
- async crawl and research jobs
|
||||||
|
- web search
|
||||||
|
- watches and production usage tracking
|
||||||
|
- SDKs for application code
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export WEBCLAW_API_KEY=wc_your_key
|
||||||
|
|
||||||
|
webclaw https://example.com --cloud
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What You Can Build
|
||||||
|
|
||||||
|
| Use case | Example |
|
||||||
|
| --- | --- |
|
||||||
|
| AI agent web access | Give Claude, Cursor, or another MCP client clean page context |
|
||||||
|
| RAG ingestion | Crawl docs, help centers, blogs, and knowledge bases |
|
||||||
|
| Competitor monitoring | Track pricing pages, changelogs, docs, and product pages |
|
||||||
|
| Structured extraction | Turn messy pages into typed JSON for automations |
|
||||||
|
| Research workflows | Search, scrape, summarize, and cite multiple sources |
|
||||||
|
| Brand intelligence | Extract logos, colors, fonts, and social metadata |
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||
```
|
```text
|
||||||
webclaw/
|
webclaw/
|
||||||
crates/
|
crates/
|
||||||
webclaw-core Pure extraction engine. Zero network deps. WASM-safe.
|
webclaw-core HTML to markdown, text, JSON, and LLM-ready output
|
||||||
webclaw-fetch HTTP client + TLS fingerprinting (wreq/BoringSSL). Crawler. Batch ops.
|
webclaw-fetch Fetching, crawling, batching, and mapping
|
||||||
webclaw-llm LLM provider chain (Ollama -> OpenAI -> Anthropic)
|
webclaw-llm Local and hosted LLM provider support
|
||||||
webclaw-pdf PDF text extraction
|
webclaw-pdf PDF text extraction
|
||||||
webclaw-mcp MCP server (10 tools for AI agents)
|
webclaw-mcp MCP server for AI agents
|
||||||
webclaw-cli CLI binary
|
webclaw-cli Command-line interface
|
||||||
```
|
```
|
||||||
|
|
||||||
`webclaw-core` takes raw HTML as a `&str` and returns structured output. No I/O, no network, no allocator tricks. Can compile to WASM.
|
`webclaw-core` is pure extraction logic: no network I/O, small surface area, and usable independently from the fetching layer.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
| Variable | Description |
|
| Variable | Description |
|
||||||
|----------|-------------|
|
| --- | --- |
|
||||||
| `WEBCLAW_API_KEY` | Cloud API key (enables bot bypass, JS rendering, search, research) |
|
| `WEBCLAW_API_KEY` | Hosted API key |
|
||||||
| `OLLAMA_HOST` | Ollama URL for local LLM features (default: `http://localhost:11434`) |
|
| `OLLAMA_HOST` | Ollama URL for local LLM features |
|
||||||
| `OPENAI_API_KEY` | OpenAI API key for LLM features |
|
| `OPENAI_API_KEY` | OpenAI-compatible LLM provider key |
|
||||||
| `ANTHROPIC_API_KEY` | Anthropic API key for LLM features |
|
| `OPENAI_BASE_URL` | OpenAI-compatible base URL |
|
||||||
|
| `ANTHROPIC_API_KEY` | Anthropic-compatible LLM provider key |
|
||||||
|
| `ANTHROPIC_BASE_URL` | Anthropic-compatible base URL |
|
||||||
| `WEBCLAW_PROXY` | Single proxy URL |
|
| `WEBCLAW_PROXY` | Single proxy URL |
|
||||||
| `WEBCLAW_PROXY_FILE` | Path to proxy pool file |
|
| `WEBCLAW_PROXY_FILE` | Proxy pool file |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Cloud API (optional)
|
|
||||||
|
|
||||||
For bot-protected sites, JS rendering, and advanced features, webclaw offers a hosted API at [webclaw.io](https://webclaw.io).
|
|
||||||
|
|
||||||
The CLI and MCP server work locally first. Cloud is used as a fallback when:
|
|
||||||
- A site has bot protection (Cloudflare, DataDome, WAF)
|
|
||||||
- A page requires JavaScript rendering
|
|
||||||
- You use search or research tools
|
|
||||||
|
|
||||||
```bash
|
|
||||||
export WEBCLAW_API_KEY=wc_your_key
|
|
||||||
|
|
||||||
# Automatic: tries local first, cloud on bot detection
|
|
||||||
webclaw https://protected-site.com
|
|
||||||
|
|
||||||
# Force cloud
|
|
||||||
webclaw --cloud https://spa-site.com
|
|
||||||
```
|
|
||||||
|
|
||||||
### SDKs
|
|
||||||
|
|
||||||
```bash
|
|
||||||
npm install @webclaw/sdk # TypeScript/JavaScript
|
|
||||||
pip install webclaw # Python
|
|
||||||
go get github.com/0xMassi/webclaw-go # Go
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Use cases
|
|
||||||
|
|
||||||
- **AI agents** — Give Claude/Cursor/GPT real-time web access via MCP
|
|
||||||
- **Research** — Crawl documentation, competitor sites, news archives
|
|
||||||
- **Price monitoring** — Track changes with `--diff-with` snapshots
|
|
||||||
- **Training data** — Prepare web content for fine-tuning with token-optimized output
|
|
||||||
- **Content pipelines** — Batch extract + summarize in CI/CD
|
|
||||||
- **Brand intelligence** — Extract visual identity from any website
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Community
|
|
||||||
|
|
||||||
- [Discord](https://discord.gg/KDfd48EpnW) — questions, feedback, show what you built
|
|
||||||
- [GitHub Issues](https://github.com/0xMassi/webclaw/issues) — bug reports and feature requests
|
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
The most useful contributions right now are practical and small:
|
||||||
|
|
||||||
|
- add examples for real agent and RAG workflows
|
||||||
|
- improve SDK snippets
|
||||||
|
- report pages that extract poorly
|
||||||
|
- add failing fixtures for messy HTML
|
||||||
|
- improve docs for MCP clients and local setup
|
||||||
|
- test the CLI on more Linux/macOS environments
|
||||||
|
|
||||||
|
Good first places to start:
|
||||||
|
|
||||||
- [Good first issues](https://github.com/0xMassi/webclaw/issues?q=label%3A%22good+first+issue%22)
|
- [Good first issues](https://github.com/0xMassi/webclaw/issues?q=label%3A%22good+first+issue%22)
|
||||||
- [Architecture docs](CONTRIBUTING.md#architecture)
|
- [Open a bug report](https://github.com/0xMassi/webclaw/issues/new)
|
||||||
|
- [Start a discussion](https://github.com/0xMassi/webclaw/discussions)
|
||||||
|
|
||||||
## Acknowledgments
|
If a page extracts badly, include:
|
||||||
|
|
||||||
TLS and HTTP/2 browser fingerprinting is powered by [wreq](https://github.com/0x676e67/wreq) and [http2](https://github.com/0x676e67/http2) by [@0x676e67](https://github.com/0x676e67), who pioneered browser-grade HTTP/2 fingerprinting in Rust.
|
```text
|
||||||
|
URL:
|
||||||
|
Command or API request:
|
||||||
|
Expected output:
|
||||||
|
Actual output:
|
||||||
|
Format used: markdown / llm / text / json / html
|
||||||
|
CLI, MCP, SDK, or API:
|
||||||
|
```
|
||||||
|
|
||||||
|
Please remove secrets, cookies, private tokens, and customer data from logs before posting.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Infrastructure Partner
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<td align="center">
|
||||||
|
<a href="https://coldproxy.com/">
|
||||||
|
<img src="./assets/sponsors/coldproxy-banner.png" alt="ColdProxy" width="720" />
|
||||||
|
</a>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>
|
||||||
|
<strong>ColdProxy</strong> supports webclaw as an Infrastructure Partner, providing residential IPv4,
|
||||||
|
residential IPv6, and datacenter IPv6 proxy infrastructure across 195+ countries for public data
|
||||||
|
collection, regional testing, monitoring, and web scraping workflows. Explore
|
||||||
|
<a href="https://coldproxy.com/">ColdProxy</a>'s latest plans and available offers directly on the website.
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Studio Partners
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<td width="340" align="center">
|
||||||
|
<a href="https://quantumproxies.net/?utm_source=webclaw&utm_medium=github&utm_campaign=sponsor">
|
||||||
|
<img src="./assets/sponsors/quantum-proxies-banner.png" alt="Quantum Proxies" width="300" />
|
||||||
|
</a>
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<strong>Quantum Proxies</strong> provides fast, reliable residential and ISP proxy infrastructure for developers running large-scale extraction workloads.
|
||||||
|
Get 20% off any plan with code <code>WEBCLAW20</code> at
|
||||||
|
<a href="https://quantumproxies.net/?utm_source=webclaw&utm_medium=github&utm_campaign=sponsor">quantumproxies.net</a>.
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td width="340" align="center">
|
||||||
|
<a href="https://proxy-seller.com/?partner=KXMQNNLIGHXR4B">
|
||||||
|
<img src="./assets/sponsors/proxy-seller-banner.png" alt="Proxy-Seller" width="300" />
|
||||||
|
</a>
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<strong>Proxy-Seller</strong> maintains a global network of residential and datacenter proxies optimized for web extraction at scale.
|
||||||
|
The service supports high-volume concurrent scraping, geographic rotation, and integration with web extraction tools.
|
||||||
|
Use code <code>WBC15</code> for 15% off IPv4, IPv6, ISP, and Residential proxies, and 10% off Mobile at
|
||||||
|
<a href="https://proxy-seller.com/?partner=KXMQNNLIGHXR4B">proxy-seller.com</a>.
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td width="340" align="center">
|
||||||
|
<a href="https://www.rapidproxy.io/?ref=webclaw">
|
||||||
|
<img src="./assets/sponsors/rapidproxy-banner.png" alt="RapidProxy" width="300" />
|
||||||
|
</a>
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<strong>RapidProxy</strong> delivers fast, reliable proxy infrastructure for large-scale data collection.
|
||||||
|
With 90M+ residential IPs, smart rotation, high concurrency, AI-powered CAPTCHA bypass, and non-expiring traffic, it helps keep scraping workflows stable at scale.
|
||||||
|
Use code <code>webclaw</code> for 10% off, or
|
||||||
|
<a href="https://www.rapidproxy.io/?ref=webclaw">Try it free</a>.
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Community Plugins
|
||||||
|
|
||||||
|
Third-party plugins that integrate webclaw with AI agent platforms:
|
||||||
|
|
||||||
|
| Plugin | Platform | What it does |
|
||||||
|
|---|---|---|
|
||||||
|
| [openclaw-webclaw](https://github.com/jal-co/openclaw-webclaw) | [OpenClaw](https://openclaw.ai) | Native webclaw v1 API plugin with 9 tools: scrape, search, crawl, extract, summarize, diff, map, batch, brand |
|
||||||
|
| [hermes-webclaw](https://github.com/jal-co/hermes-webclaw) | [Hermes Agent](https://github.com/NousResearch/hermes-agent) | Web search provider and 9 dedicated tools for the full v1 API surface. Install with `hermes plugins install jal-co/hermes-webclaw` |
|
||||||
|
|
||||||
|
Built a webclaw integration? [Open a PR](https://github.com/0xMassi/webclaw/pulls) to add it here.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Contributors
|
||||||
|
|
||||||
|
Thanks to everyone improving webclaw through issues, examples, docs, bug reports, and pull requests.
|
||||||
|
|
||||||
|
<a href="https://github.com/0xMassi/webclaw/graphs/contributors">
|
||||||
|
<img src="https://contrib.rocks/image?repo=0xMassi/webclaw" alt="webclaw contributors" />
|
||||||
|
</a>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Star History
|
||||||
|
|
||||||
|
<a href="https://www.star-history.com/?repos=0xMassi%2Fwebclaw&type=date&legend=top-left">
|
||||||
|
<picture>
|
||||||
|
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/chart?repos=0xMassi/webclaw&type=date&theme=dark&legend=top-left" />
|
||||||
|
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/chart?repos=0xMassi/webclaw&type=date&legend=top-left" />
|
||||||
|
<img alt="Star History Chart" src="https://api.star-history.com/chart?repos=0xMassi/webclaw&type=date&legend=top-left" />
|
||||||
|
</picture>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
|
|
|
||||||
BIN
assets/sponsors/coldproxy-banner.png
Normal file
BIN
assets/sponsors/coldproxy-banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.3 MiB |
BIN
assets/sponsors/coldproxy-logo.png
Normal file
BIN
assets/sponsors/coldproxy-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 757 KiB |
BIN
assets/sponsors/proxy-seller-banner.png
Normal file
BIN
assets/sponsors/proxy-seller-banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 26 KiB |
BIN
assets/sponsors/quantum-proxies-banner.png
Normal file
BIN
assets/sponsors/quantum-proxies-banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 21 KiB |
BIN
assets/sponsors/quantum-proxies.png
Normal file
BIN
assets/sponsors/quantum-proxies.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 5.8 KiB |
BIN
assets/sponsors/rapidproxy-banner.png
Normal file
BIN
assets/sponsors/rapidproxy-banner.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 413 KiB |
|
|
@ -35,18 +35,49 @@ const ANTIBOT_TITLES: &[&str] = &[
|
||||||
"ddos protection",
|
"ddos protection",
|
||||||
];
|
];
|
||||||
|
|
||||||
/// Detect why a page returned empty content.
|
/// URL host/path fragments that indicate a GDPR/cookie consent redirect.
|
||||||
|
const CONSENT_URL_FRAGMENTS: &[&str] = &[
|
||||||
|
"://consent.",
|
||||||
|
"/consent?",
|
||||||
|
"/consent/",
|
||||||
|
"collectconsent",
|
||||||
|
"consentcheck",
|
||||||
|
"/cmp/",
|
||||||
|
"guce.advertising.com",
|
||||||
|
];
|
||||||
|
|
||||||
|
/// English consent-wall title prefixes. Many providers localize this page, so
|
||||||
|
/// this is a best-effort secondary signal. URL shape is the primary signal.
|
||||||
|
const CONSENT_TITLES: &[&str] = &[
|
||||||
|
"before you continue",
|
||||||
|
"your privacy choices",
|
||||||
|
"we value your privacy",
|
||||||
|
"we care about your privacy",
|
||||||
|
"cookie consent",
|
||||||
|
"consent required",
|
||||||
|
];
|
||||||
|
|
||||||
|
/// Detect why a page returned empty or near-empty content.
|
||||||
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
enum EmptyReason {
|
enum EmptyReason {
|
||||||
/// Anti-bot challenge page (Cloudflare, Akamai, etc.)
|
/// Anti-bot challenge page (Cloudflare, Akamai, etc.)
|
||||||
Antibot,
|
Antibot,
|
||||||
|
/// GDPR/cookie consent redirect.
|
||||||
|
ConsentWall,
|
||||||
/// JS-only SPA that returns an empty shell without a browser
|
/// JS-only SPA that returns an empty shell without a browser
|
||||||
JsRequired,
|
JsRequired,
|
||||||
/// Page has content — not empty
|
/// Page has content.
|
||||||
None,
|
None,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn detect_empty(result: &ExtractionResult) -> EmptyReason {
|
fn detect_empty(result: &ExtractionResult) -> EmptyReason {
|
||||||
// Has real content — nothing to warn about
|
// Consent walls can have a tiny body, so check before the content
|
||||||
|
// short-circuit.
|
||||||
|
if is_consent_wall(result) {
|
||||||
|
return EmptyReason::ConsentWall;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Has real content. Nothing to warn about.
|
||||||
if result.metadata.word_count > 50 || !result.content.markdown.is_empty() {
|
if result.metadata.word_count > 50 || !result.content.markdown.is_empty() {
|
||||||
return EmptyReason::None;
|
return EmptyReason::None;
|
||||||
}
|
}
|
||||||
|
|
@ -67,6 +98,35 @@ fn detect_empty(result: &ExtractionResult) -> EmptyReason {
|
||||||
EmptyReason::None
|
EmptyReason::None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A consent wall is identified by either:
|
||||||
|
/// 1. The final URL pointing at a known consent host/path, or
|
||||||
|
/// 2. A consent-wall title prefix with a very small body (at most 50 words).
|
||||||
|
fn is_consent_wall(result: &ExtractionResult) -> bool {
|
||||||
|
if let Some(ref url) = result.metadata.url {
|
||||||
|
let lower = url.to_ascii_lowercase();
|
||||||
|
if CONSENT_URL_FRAGMENTS
|
||||||
|
.iter()
|
||||||
|
.any(|fragment| lower.contains(fragment))
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if result.metadata.word_count <= 50
|
||||||
|
&& let Some(ref title) = result.metadata.title
|
||||||
|
{
|
||||||
|
let lower = title.to_lowercase();
|
||||||
|
if CONSENT_TITLES
|
||||||
|
.iter()
|
||||||
|
.any(|prefix| lower.starts_with(prefix))
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
fn warn_empty(url: &str, reason: &EmptyReason) {
|
fn warn_empty(url: &str, reason: &EmptyReason) {
|
||||||
match reason {
|
match reason {
|
||||||
EmptyReason::Antibot => eprintln!(
|
EmptyReason::Antibot => eprintln!(
|
||||||
|
|
@ -74,6 +134,12 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
|
||||||
This site requires CAPTCHA solving or browser rendering.\n\
|
This site requires CAPTCHA solving or browser rendering.\n\
|
||||||
Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing"
|
Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing"
|
||||||
),
|
),
|
||||||
|
EmptyReason::ConsentWall => eprintln!(
|
||||||
|
"\x1b[33mwarning:\x1b[0m GDPR/cookie consent wall detected on {url}\n\
|
||||||
|
The site redirected to a consent page and returned no usable content.\n\
|
||||||
|
Try a different region via --proxy, or pass a pre-accepted consent cookie\n\
|
||||||
|
via --cookie / --cookie-file."
|
||||||
|
),
|
||||||
EmptyReason::JsRequired => eprintln!(
|
EmptyReason::JsRequired => eprintln!(
|
||||||
"\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\
|
"\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\
|
||||||
This site requires JavaScript rendering (SPA).\n\
|
This site requires JavaScript rendering (SPA).\n\
|
||||||
|
|
@ -260,7 +326,7 @@ struct Cli {
|
||||||
#[arg(long, env = "WEBCLAW_LLM_MODEL")]
|
#[arg(long, env = "WEBCLAW_LLM_MODEL")]
|
||||||
llm_model: Option<String>,
|
llm_model: Option<String>,
|
||||||
|
|
||||||
/// Override the LLM base URL (Ollama or OpenAI-compatible)
|
/// Override the LLM base URL (Ollama, OpenAI-compatible, or Anthropic-compatible)
|
||||||
#[arg(long, env = "WEBCLAW_LLM_BASE_URL")]
|
#[arg(long, env = "WEBCLAW_LLM_BASE_URL")]
|
||||||
llm_base_url: Option<String>,
|
llm_base_url: Option<String>,
|
||||||
|
|
||||||
|
|
@ -308,6 +374,34 @@ enum Commands {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
facts: Option<PathBuf>,
|
facts: Option<PathBuf>,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/// List all vertical extractors in the catalog.
|
||||||
|
///
|
||||||
|
/// Each entry has a stable `name` (usable with `webclaw vertical <name>`),
|
||||||
|
/// a human-friendly label, a one-line description, and the URL
|
||||||
|
/// patterns it claims. The same data is served by `/v1/extractors`
|
||||||
|
/// when running the REST API.
|
||||||
|
Extractors {
|
||||||
|
/// Emit JSON instead of a human-friendly table.
|
||||||
|
#[arg(long)]
|
||||||
|
json: bool,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Run a vertical extractor by name. Returns typed JSON with fields
|
||||||
|
/// specific to the target site (title, price, author, rating, etc.)
|
||||||
|
/// rather than generic markdown.
|
||||||
|
///
|
||||||
|
/// Use `webclaw extractors` to see the full list. Example:
|
||||||
|
/// `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/`.
|
||||||
|
Vertical {
|
||||||
|
/// Vertical name (e.g. `reddit`, `github_repo`, `trustpilot_reviews`).
|
||||||
|
name: String,
|
||||||
|
/// URL to extract.
|
||||||
|
url: String,
|
||||||
|
/// Emit compact JSON (single line). Default is pretty-printed.
|
||||||
|
#[arg(long)]
|
||||||
|
raw: bool,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, ValueEnum)]
|
#[derive(Clone, ValueEnum)]
|
||||||
|
|
@ -323,6 +417,9 @@ enum OutputFormat {
|
||||||
enum Browser {
|
enum Browser {
|
||||||
Chrome,
|
Chrome,
|
||||||
Firefox,
|
Firefox,
|
||||||
|
/// Safari iOS 26. Pair with a country-matched residential proxy for sites
|
||||||
|
/// that reject non-mobile profiles.
|
||||||
|
SafariIos,
|
||||||
Random,
|
Random,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -349,16 +446,21 @@ impl From<Browser> for BrowserProfile {
|
||||||
match b {
|
match b {
|
||||||
Browser::Chrome => BrowserProfile::Chrome,
|
Browser::Chrome => BrowserProfile::Chrome,
|
||||||
Browser::Firefox => BrowserProfile::Firefox,
|
Browser::Firefox => BrowserProfile::Firefox,
|
||||||
|
Browser::SafariIos => BrowserProfile::SafariIos,
|
||||||
Browser::Random => BrowserProfile::Random,
|
Browser::Random => BrowserProfile::Random,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn init_logging(verbose: bool) {
|
fn init_logging(verbose: bool) {
|
||||||
|
// html5ever / markup5ever / selectors emit WARN on common real-world HTML
|
||||||
|
// quirks. They are rarely actionable for CLI users, so keep them quiet by
|
||||||
|
// default while still allowing WEBCLAW_LOG to override the filter.
|
||||||
|
let default = "warn,html5ever=error,markup5ever=error,selectors=error";
|
||||||
let filter = if verbose {
|
let filter = if verbose {
|
||||||
EnvFilter::new("webclaw=debug")
|
EnvFilter::new("webclaw=debug,html5ever=error,markup5ever=error,selectors=error")
|
||||||
} else {
|
} else {
|
||||||
EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new("warn"))
|
EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new(default))
|
||||||
};
|
};
|
||||||
|
|
||||||
tracing_subscriber::fmt().with_env_filter(filter).init();
|
tracing_subscriber::fmt().with_env_filter(filter).init();
|
||||||
|
|
@ -511,7 +613,15 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
|
||||||
Err(_) => (String::new(), String::new(), None),
|
Err(_) => (String::new(), String::new(), None),
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut stem = path.trim_matches('/').to_string();
|
// Drop empty / "." / ".." path segments so a URL path like
|
||||||
|
// `/../../etc/passwd` can't climb out of the output directory.
|
||||||
|
let cleaned_path: String = path
|
||||||
|
.split('/')
|
||||||
|
.filter(|seg| !seg.is_empty() && *seg != "." && *seg != "..")
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join("/");
|
||||||
|
|
||||||
|
let mut stem = cleaned_path;
|
||||||
if stem.is_empty() {
|
if stem.is_empty() {
|
||||||
// Use hostname for root URLs to avoid collisions in batch mode
|
// Use hostname for root URLs to avoid collisions in batch mode
|
||||||
let clean_host = host.strip_prefix("www.").unwrap_or(&host);
|
let clean_host = host.strip_prefix("www.").unwrap_or(&host);
|
||||||
|
|
@ -538,13 +648,59 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
|
||||||
format!("{sanitized}.{ext}")
|
format!("{sanitized}.{ext}")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reject a caller-supplied (CSV `url,filename`) name that could escape the
|
||||||
|
/// output directory: absolute paths, drive prefixes, root, or any `..`
|
||||||
|
/// component. Returns the validated relative path on success.
|
||||||
|
fn safe_relative_filename(filename: &str) -> Result<PathBuf, String> {
|
||||||
|
let candidate = Path::new(filename);
|
||||||
|
use std::path::Component;
|
||||||
|
for comp in candidate.components() {
|
||||||
|
match comp {
|
||||||
|
Component::Normal(_) | Component::CurDir => {}
|
||||||
|
Component::ParentDir => {
|
||||||
|
return Err(format!("refusing path with '..' component: {filename}"));
|
||||||
|
}
|
||||||
|
Component::RootDir | Component::Prefix(_) => {
|
||||||
|
return Err(format!("refusing absolute output path: {filename}"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if candidate.as_os_str().is_empty() {
|
||||||
|
return Err("empty output filename".to_string());
|
||||||
|
}
|
||||||
|
Ok(candidate.to_path_buf())
|
||||||
|
}
|
||||||
|
|
||||||
/// Write extraction output to a file inside `dir`, creating parent dirs as needed.
|
/// Write extraction output to a file inside `dir`, creating parent dirs as needed.
|
||||||
|
///
|
||||||
|
/// `filename` may originate from an attacker-controlled `--urls-file`
|
||||||
|
/// (`url,filename` CSV). It is validated for traversal, and the canonical
|
||||||
|
/// destination directory is asserted to stay under the canonical output
|
||||||
|
/// directory before any write.
|
||||||
fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
|
fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
|
||||||
let dest = dir.join(filename);
|
let rel = safe_relative_filename(filename)?;
|
||||||
|
let dest = dir.join(&rel);
|
||||||
|
|
||||||
|
std::fs::create_dir_all(dir)
|
||||||
|
.map_err(|e| format!("failed to create directory {}: {e}", dir.display()))?;
|
||||||
|
let base = dir
|
||||||
|
.canonicalize()
|
||||||
|
.map_err(|e| format!("failed to resolve output dir {}: {e}", dir.display()))?;
|
||||||
|
|
||||||
if let Some(parent) = dest.parent() {
|
if let Some(parent) = dest.parent() {
|
||||||
std::fs::create_dir_all(parent)
|
std::fs::create_dir_all(parent)
|
||||||
.map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?;
|
.map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?;
|
||||||
|
let canon_parent = parent
|
||||||
|
.canonicalize()
|
||||||
|
.map_err(|e| format!("failed to resolve {}: {e}", parent.display()))?;
|
||||||
|
if !canon_parent.starts_with(&base) {
|
||||||
|
return Err(format!(
|
||||||
|
"refusing to write outside output dir: {}",
|
||||||
|
dest.display()
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::fs::write(&dest, content)
|
std::fs::write(&dest, content)
|
||||||
.map_err(|e| format!("failed to write {}: {e}", dest.display()))?;
|
.map_err(|e| format!("failed to write {}: {e}", dest.display()))?;
|
||||||
let word_count = content.split_whitespace().count();
|
let word_count = content.split_whitespace().count();
|
||||||
|
|
@ -817,11 +973,18 @@ async fn enrich_html_with_stylesheets(html: &str, base_url: &str) -> String {
|
||||||
|
|
||||||
let client = reqwest::Client::builder()
|
let client = reqwest::Client::builder()
|
||||||
.timeout(std::time::Duration::from_secs(5))
|
.timeout(std::time::Duration::from_secs(5))
|
||||||
|
.redirect(reqwest::redirect::Policy::none())
|
||||||
.build()
|
.build()
|
||||||
.unwrap_or_default();
|
.unwrap_or_default();
|
||||||
|
|
||||||
let mut extra_css = String::new();
|
let mut extra_css = String::new();
|
||||||
for href in &hrefs {
|
for href in &hrefs {
|
||||||
|
if webclaw_fetch::url_security::validate_public_http_url(href)
|
||||||
|
.await
|
||||||
|
.is_err()
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
if let Ok(resp) = client.get(href).send().await
|
if let Ok(resp) = client.get(href).send().await
|
||||||
&& resp.status().is_success()
|
&& resp.status().is_success()
|
||||||
&& let Ok(body) = resp.text().await
|
&& let Ok(body) = resp.text().await
|
||||||
|
|
@ -1570,6 +1733,13 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
|
||||||
serde_json::to_string(payload).unwrap_or_default()
|
serde_json::to_string(payload).unwrap_or_default()
|
||||||
};
|
};
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
|
// SSRF guard: a webhook URL is user-supplied and otherwise bypasses
|
||||||
|
// the fetch-layer protections, so resolve + reject private/internal
|
||||||
|
// destinations before sending the payload.
|
||||||
|
if let Err(e) = webclaw_fetch::url_security::validate_public_http_url(&url).await {
|
||||||
|
eprintln!("[webhook] refusing unsafe URL: {e}");
|
||||||
|
return;
|
||||||
|
}
|
||||||
match reqwest::Client::builder()
|
match reqwest::Client::builder()
|
||||||
.timeout(std::time::Duration::from_secs(10))
|
.timeout(std::time::Duration::from_secs(10))
|
||||||
.build()
|
.build()
|
||||||
|
|
@ -1641,7 +1811,9 @@ async fn run_watch_single(
|
||||||
);
|
);
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
|
// Clamp to >=1s: `--watch-interval 0` would otherwise spin the
|
||||||
|
// fetch loop with zero delay and hammer the target.
|
||||||
|
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await;
|
||||||
|
|
||||||
if cancelled.load(Ordering::Relaxed) {
|
if cancelled.load(Ordering::Relaxed) {
|
||||||
eprintln!("[watch] Stopped");
|
eprintln!("[watch] Stopped");
|
||||||
|
|
@ -1733,7 +1905,9 @@ async fn run_watch_multi(
|
||||||
let mut check_number = 0u64;
|
let mut check_number = 0u64;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
|
// Clamp to >=1s: `--watch-interval 0` would otherwise spin the
|
||||||
|
// fetch loop with zero delay and hammer the target.
|
||||||
|
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await;
|
||||||
|
|
||||||
if cancelled.load(Ordering::Relaxed) {
|
if cancelled.load(Ordering::Relaxed) {
|
||||||
eprintln!("[watch] Stopped");
|
eprintln!("[watch] Stopped");
|
||||||
|
|
@ -1887,8 +2061,9 @@ async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
|
||||||
Ok(Box::new(provider))
|
Ok(Box::new(provider))
|
||||||
}
|
}
|
||||||
"anthropic" => {
|
"anthropic" => {
|
||||||
let provider = webclaw_llm::providers::anthropic::AnthropicProvider::new(
|
let provider = webclaw_llm::providers::anthropic::AnthropicProvider::with_base_url(
|
||||||
None,
|
None,
|
||||||
|
cli.llm_base_url.clone(),
|
||||||
cli.llm_model.clone(),
|
cli.llm_model.clone(),
|
||||||
)
|
)
|
||||||
.ok_or("ANTHROPIC_API_KEY not set")?;
|
.ok_or("ANTHROPIC_API_KEY not set")?;
|
||||||
|
|
@ -2211,7 +2386,9 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
.join("-")
|
.join("-")
|
||||||
.to_lowercase();
|
.to_lowercase();
|
||||||
let slug = if slug.len() > 50 { &slug[..50] } else { &slug };
|
// char-safe truncation: byte slicing panics if byte offset 50
|
||||||
|
// lands mid-codepoint (multibyte queries).
|
||||||
|
let slug: String = slug.chars().take(50).collect();
|
||||||
let filename = format!("research-{slug}.json");
|
let filename = format!("research-{slug}.json");
|
||||||
|
|
||||||
let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default();
|
let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default();
|
||||||
|
|
@ -2288,6 +2465,83 @@ async fn main() {
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
Commands::Extractors { json } => {
|
||||||
|
let entries = webclaw_fetch::extractors::list();
|
||||||
|
if *json {
|
||||||
|
// Serialize with serde_json. ExtractorInfo derives
|
||||||
|
// Serialize so this is a one-liner.
|
||||||
|
match serde_json::to_string_pretty(&entries) {
|
||||||
|
Ok(s) => println!("{s}"),
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("error: failed to serialise catalog: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Human-friendly table: NAME + LABEL + one URL
|
||||||
|
// pattern sample. Keeps the output scannable on a
|
||||||
|
// narrow terminal.
|
||||||
|
println!("{} vertical extractors available:\n", entries.len());
|
||||||
|
let name_w = entries.iter().map(|e| e.name.len()).max().unwrap_or(0);
|
||||||
|
let label_w = entries.iter().map(|e| e.label.len()).max().unwrap_or(0);
|
||||||
|
for e in &entries {
|
||||||
|
let pattern_sample = e.url_patterns.first().copied().unwrap_or("");
|
||||||
|
println!(
|
||||||
|
" {:<nw$} {:<lw$} {}",
|
||||||
|
e.name,
|
||||||
|
e.label,
|
||||||
|
pattern_sample,
|
||||||
|
nw = name_w,
|
||||||
|
lw = label_w,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
println!("\nRun one: webclaw vertical <name> <url>");
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Commands::Vertical { name, url, raw } => {
|
||||||
|
// Build a FetchClient with cloud fallback attached when
|
||||||
|
// WEBCLAW_API_KEY is set. Antibot-gated verticals
|
||||||
|
// (amazon, ebay, etsy, trustpilot) need this to escalate
|
||||||
|
// on bot protection.
|
||||||
|
let fetch_cfg = webclaw_fetch::FetchConfig {
|
||||||
|
browser: webclaw_fetch::BrowserProfile::Firefox,
|
||||||
|
..webclaw_fetch::FetchConfig::default()
|
||||||
|
};
|
||||||
|
let mut client = match webclaw_fetch::FetchClient::new(fetch_cfg) {
|
||||||
|
Ok(c) => c,
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("error: failed to build fetch client: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if let Some(cloud) = webclaw_fetch::cloud::CloudClient::from_env() {
|
||||||
|
client = client.with_cloud(cloud);
|
||||||
|
}
|
||||||
|
match webclaw_fetch::extractors::dispatch_by_name(&client, name, url).await {
|
||||||
|
Ok(data) => {
|
||||||
|
let rendered = if *raw {
|
||||||
|
serde_json::to_string(&data)
|
||||||
|
} else {
|
||||||
|
serde_json::to_string_pretty(&data)
|
||||||
|
};
|
||||||
|
match rendered {
|
||||||
|
Ok(s) => println!("{s}"),
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("error: JSON encode failed: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// UrlMismatch / UnknownVertical / Fetch all get
|
||||||
|
// Display impls with actionable messages.
|
||||||
|
eprintln!("error: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2430,6 +2684,64 @@ async fn main() {
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use webclaw_core::Content;
|
||||||
|
|
||||||
|
fn empty_result(title: Option<&str>, url: Option<&str>, markdown: &str) -> ExtractionResult {
|
||||||
|
ExtractionResult {
|
||||||
|
metadata: Metadata {
|
||||||
|
title: title.map(str::to_string),
|
||||||
|
description: None,
|
||||||
|
author: None,
|
||||||
|
published_date: None,
|
||||||
|
language: None,
|
||||||
|
url: url.map(str::to_string),
|
||||||
|
site_name: None,
|
||||||
|
image: None,
|
||||||
|
favicon: None,
|
||||||
|
word_count: markdown.split_whitespace().count(),
|
||||||
|
},
|
||||||
|
content: Content {
|
||||||
|
markdown: markdown.to_string(),
|
||||||
|
plain_text: markdown.to_string(),
|
||||||
|
links: vec![],
|
||||||
|
images: vec![],
|
||||||
|
code_blocks: vec![],
|
||||||
|
raw_html: None,
|
||||||
|
},
|
||||||
|
domain_data: None,
|
||||||
|
structured_data: vec![],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn detect_empty_identifies_consent_redirect_url() {
|
||||||
|
let result = empty_result(
|
||||||
|
Some("Yahoo"),
|
||||||
|
Some("https://guce.advertising.com/collectIdentifiers?sessionId=abc"),
|
||||||
|
"Continue",
|
||||||
|
);
|
||||||
|
assert_eq!(detect_empty(&result), EmptyReason::ConsentWall);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn detect_empty_identifies_short_consent_title() {
|
||||||
|
let result = empty_result(
|
||||||
|
Some("Before you continue"),
|
||||||
|
Some("https://www.google.com/"),
|
||||||
|
"Review privacy options",
|
||||||
|
);
|
||||||
|
assert_eq!(detect_empty(&result), EmptyReason::ConsentWall);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
fn detect_empty_does_not_flag_real_content_with_consent_words() {
    // A long article that merely talks about cookie consent must not be
    // reported as empty or as a consent wall.
    let body = "This article explains cookie consent patterns for product teams with enough real body text to be useful. It covers consent banners, privacy controls, analytics configuration, regional requirements, product tradeoffs, implementation details, testing flows, debugging notes, accessibility needs, and operational lessons from real teams shipping public websites across multiple markets. It also explains measurement, rollout planning, copy review, support workflows, design constraints, release notes, and how to keep privacy choices understandable for users.";
    let page = empty_result(
        Some("Cookie consent patterns explained"),
        Some("https://example.com/blog"),
        body,
    );

    assert_eq!(detect_empty(&page), EmptyReason::None);
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn url_to_filename_root() {
|
fn url_to_filename_root() {
|
||||||
|
|
@ -2528,4 +2840,66 @@ mod tests {
|
||||||
assert_eq!(content, "hello");
|
assert_eq!(content, "hello");
|
||||||
let _ = std::fs::remove_dir_all(&dir);
|
let _ = std::fs::remove_dir_all(&dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn url_to_filename_strips_traversal_segments() {
    // `..`, `.` and empty path segments must never reach the generated
    // relative filename.
    let traversal = url_to_filename(
        "https://example.com/../../etc/passwd",
        &OutputFormat::Markdown,
    );
    assert!(!traversal.contains(".."), "traversal leaked: {traversal}");
    assert_eq!(traversal, "etc/passwd.md");

    let dotted = url_to_filename("https://example.com/a/./b//c", &OutputFormat::Json);
    assert_eq!(dotted, "a/b/c.json");
}
|
||||||
|
|
||||||
|
#[test]
fn safe_relative_filename_rejects_escapes() {
    // Parent-directory escapes, absolute paths and the empty name are refused.
    for rejected in ["../escape.md", "a/../../b.md", "/etc/passwd", ""] {
        assert!(safe_relative_filename(rejected).is_err());
    }

    // Normal nested relative names stay allowed.
    for accepted in ["nested/deep/file.md", "./ok.md"] {
        assert!(safe_relative_filename(accepted).is_ok());
    }
}
|
||||||
|
|
||||||
|
#[test]
fn write_to_file_refuses_traversal_filename() {
    let dir = std::env::temp_dir().join("webclaw_test_traversal_dir");
    let _ = std::fs::remove_dir_all(&dir);

    // Where `../../tmp/webclaw_pwned.md` would land if it were joined onto
    // `dir` and the `..` segments were honoured. Derived from `dir` instead of
    // hard-coding `/tmp`, because `std::env::temp_dir()` is not `/tmp` on
    // every platform; a hard-coded path would make the check vacuous there.
    let escaped = dir
        .parent()
        .and_then(|parent| parent.parent())
        .map(|root| root.join("tmp").join("webclaw_pwned.md"))
        .unwrap_or_else(|| std::path::PathBuf::from("/tmp/webclaw_pwned.md"));

    // CSV-supplied `url,filename` traversal attempt.
    let err = write_to_file(&dir, "../../tmp/webclaw_pwned.md", "x").unwrap_err();
    assert!(err.contains("refusing"), "unexpected error: {err}");
    assert!(
        !escaped.exists(),
        "traversal write escaped the output dir"
    );

    let _ = std::fs::remove_dir_all(&dir);
}
|
||||||
|
|
||||||
|
#[test]
fn research_slug_truncation_is_char_safe() {
    // Multibyte query: byte-slicing at 50 would panic mid-codepoint.
    let query = "日本語".repeat(40); // 120 chars, 3 bytes each

    // Replace everything that is neither alphanumeric nor a space by a space.
    let mut cleaned = String::with_capacity(query.len());
    for ch in query.chars() {
        let keep = ch.is_alphanumeric() || ch == ' ';
        cleaned.push(if keep { ch } else { ' ' });
    }

    // Collapse whitespace runs into single dashes and lowercase the result.
    let words: Vec<&str> = cleaned.split_whitespace().collect();
    let untruncated = words.join("-").to_lowercase();

    // Truncate on character boundaries, never on raw byte offsets.
    let slug: String = untruncated.chars().take(50).collect();
    assert!(slug.chars().count() <= 50);

    // Round-trips through formatting without panicking.
    let _ = format!("research-{slug}.json");
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,10 @@ description = "Pure HTML content extraction engine for LLMs"
|
||||||
version.workspace = true
|
version.workspace = true
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
# Reddit regression fixtures are real old.reddit.com pages read at test time;
|
||||||
|
# they're large and only needed to run the test suite from the repo, so keep
|
||||||
|
# them out of the published crate.
|
||||||
|
exclude = ["testdata/reddit/*.html"]
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = ["quickjs"]
|
default = ["quickjs"]
|
||||||
|
|
@ -20,6 +24,11 @@ url = { version = "2", features = ["serde"] }
|
||||||
regex = "1"
|
regex = "1"
|
||||||
once_cell = "1"
|
once_cell = "1"
|
||||||
similar = "2"
|
similar = "2"
|
||||||
|
|
||||||
|
# rquickjs links a C library and cannot build for wasm32. Gating it per
|
||||||
|
# target keeps the `quickjs` feature usable on native while leaving the
|
||||||
|
# crate WASM-safe even with default features enabled.
|
||||||
|
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
|
||||||
rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }
|
rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
|
|
|
||||||
|
|
@ -79,9 +79,19 @@ static HSL_COLOR: Lazy<Regex> = Lazy::new(|| {
|
||||||
.unwrap()
|
.unwrap()
|
||||||
});
|
});
|
||||||
|
|
||||||
/// Matches font-family values
|
/// Matches the family tail of CSS `font:` shorthand after size/line-height.
|
||||||
static FONT_FAMILY: Lazy<Regex> =
|
static FONT_SHORTHAND_FAMILY: Lazy<Regex> = Lazy::new(|| {
|
||||||
Lazy::new(|| Regex::new(r"(?i)font-family\s*:\s*([^;}{]+)").unwrap());
|
Regex::new(
|
||||||
|
r#"(?ix)
|
||||||
|
(?:^|\s)
|
||||||
|
(?:xx-small|x-small|small|medium|large|x-large|xx-large|larger|smaller|\d*\.?\d+(?:px|rem|em|pt|pc|in|cm|mm|%|vw|vh|vmin|vmax))
|
||||||
|
(?:\s*/\s*[^\s,]+)?
|
||||||
|
\s+
|
||||||
|
(.+)$
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
macro_rules! selector {
|
macro_rules! selector {
|
||||||
($s:expr) => {{
|
($s:expr) => {{
|
||||||
|
|
@ -102,12 +112,12 @@ pub fn extract_brand(html: &str, url: Option<&str>) -> BrandIdentity {
|
||||||
let doc = Html::parse_document(html);
|
let doc = Html::parse_document(html);
|
||||||
let base_url = url.and_then(|u| Url::parse(u).ok());
|
let base_url = url.and_then(|u| Url::parse(u).ok());
|
||||||
|
|
||||||
|
let name = extract_brand_name(&doc);
|
||||||
let css_sources = collect_css(&doc);
|
let css_sources = collect_css(&doc);
|
||||||
let colors = extract_colors(&css_sources);
|
let colors = extract_colors(&css_sources, name.as_deref());
|
||||||
let fonts = extract_fonts(&css_sources);
|
let fonts = extract_fonts(&css_sources, name.as_deref());
|
||||||
let logo_url = find_logo(&doc, base_url.as_ref());
|
let logo_url = find_logo(&doc, base_url.as_ref());
|
||||||
let favicon_url = find_favicon(&doc, base_url.as_ref());
|
let favicon_url = find_favicon(&doc, base_url.as_ref());
|
||||||
let name = extract_brand_name(&doc);
|
|
||||||
let logos = find_all_logos(&doc, base_url.as_ref());
|
let logos = find_all_logos(&doc, base_url.as_ref());
|
||||||
let og_image = find_og_image(&doc, base_url.as_ref());
|
let og_image = find_og_image(&doc, base_url.as_ref());
|
||||||
|
|
||||||
|
|
@ -390,7 +400,7 @@ fn is_boring_color(hex: &str) -> bool {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_colors(decls: &[CssDecl]) -> Vec<BrandColor> {
|
fn extract_colors(decls: &[CssDecl], brand_name: Option<&str>) -> Vec<BrandColor> {
|
||||||
// Track (hex, usage) -> count
|
// Track (hex, usage) -> count
|
||||||
let mut counts: HashMap<String, HashMap<ColorUsage, usize>> = HashMap::new();
|
let mut counts: HashMap<String, HashMap<ColorUsage, usize>> = HashMap::new();
|
||||||
|
|
||||||
|
|
@ -429,6 +439,8 @@ fn extract_colors(decls: &[CssDecl]) -> Vec<BrandColor> {
|
||||||
// Sort by frequency (descending)
|
// Sort by frequency (descending)
|
||||||
colors.sort_by_key(|c| std::cmp::Reverse(c.count));
|
colors.sort_by_key(|c| std::cmp::Reverse(c.count));
|
||||||
|
|
||||||
|
demote_or_remove_oauth_palette(&mut colors, brand_name);
|
||||||
|
|
||||||
// Promote top non-white/black to Primary/Secondary if they're still Unknown
|
// Promote top non-white/black to Primary/Secondary if they're still Unknown
|
||||||
let mut assigned_primary = colors.iter().any(|c| c.usage == ColorUsage::Primary);
|
let mut assigned_primary = colors.iter().any(|c| c.usage == ColorUsage::Primary);
|
||||||
let mut assigned_secondary = colors.iter().any(|c| c.usage == ColorUsage::Secondary);
|
let mut assigned_secondary = colors.iter().any(|c| c.usage == ColorUsage::Secondary);
|
||||||
|
|
@ -450,6 +462,28 @@ fn extract_colors(decls: &[CssDecl]) -> Vec<BrandColor> {
|
||||||
colors
|
colors
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const GOOGLE_OAUTH_COLORS: &[&str] = &[
|
||||||
|
"#1A73E8", "#4285F4", "#34A853", "#FBBC05", "#EA4335", "#5F6368", "#202124", "#E8EAED",
|
||||||
|
"#F1F3F4",
|
||||||
|
];
|
||||||
|
|
||||||
|
fn demote_or_remove_oauth_palette(colors: &mut Vec<BrandColor>, brand_name: Option<&str>) {
|
||||||
|
let brand = brand_name.unwrap_or("").to_ascii_lowercase();
|
||||||
|
if brand.contains("google") {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let google_hits = colors
|
||||||
|
.iter()
|
||||||
|
.filter(|c| GOOGLE_OAUTH_COLORS.contains(&c.hex.as_str()))
|
||||||
|
.count();
|
||||||
|
if google_hits < 3 {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
colors.retain(|c| !GOOGLE_OAUTH_COLORS.contains(&c.hex.as_str()));
|
||||||
|
}
|
||||||
|
|
||||||
fn classify_color_property(property: &str) -> ColorUsage {
|
fn classify_color_property(property: &str) -> ColorUsage {
|
||||||
match property {
|
match property {
|
||||||
"background-color" | "background" => ColorUsage::Background,
|
"background-color" | "background" => ColorUsage::Background,
|
||||||
|
|
@ -584,31 +618,55 @@ const GENERIC_FONTS: &[&str] = &[
|
||||||
"initial",
|
"initial",
|
||||||
"unset",
|
"unset",
|
||||||
"revert",
|
"revert",
|
||||||
|
"arial",
|
||||||
|
"times",
|
||||||
|
"times new roman",
|
||||||
|
"courier new",
|
||||||
|
"georgia",
|
||||||
|
"menlo",
|
||||||
|
"monaco",
|
||||||
|
"consolas",
|
||||||
|
"liberation mono",
|
||||||
|
"sf mono",
|
||||||
|
"sfmono-regular",
|
||||||
|
"source code pro",
|
||||||
|
"apple color emoji",
|
||||||
|
"segoe ui",
|
||||||
|
"segoe ui emoji",
|
||||||
|
"segoe ui symbol",
|
||||||
|
"noto color emoji",
|
||||||
|
"blinkmacsystemfont",
|
||||||
|
"-apple-system",
|
||||||
];
|
];
|
||||||
|
|
||||||
fn extract_fonts(decls: &[CssDecl]) -> Vec<String> {
|
fn extract_fonts(decls: &[CssDecl], brand_name: Option<&str>) -> Vec<String> {
|
||||||
let mut freq: HashMap<String, usize> = HashMap::new();
|
let mut freq: HashMap<String, usize> = HashMap::new();
|
||||||
|
let brand = brand_name.unwrap_or("").to_ascii_lowercase();
|
||||||
|
|
||||||
for decl in decls {
|
for decl in decls {
|
||||||
if decl.property != "font-family" && decl.property != "font" {
|
if decl.property != "font-family" && decl.property != "font" {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// For shorthand `font:`, try to extract font-family portion
|
// For shorthand `font:`, extract only the family tail after the
|
||||||
|
// size/line-height token. The previous implementation treated values
|
||||||
|
// like `500 12px Roboto` as a font family, which polluted `/v1/brand`
|
||||||
|
// output with CSS declarations instead of usable family names.
|
||||||
let family_str = if decl.property == "font" {
|
let family_str = if decl.property == "font" {
|
||||||
// font shorthand: the font-family is the last part after the size.
|
match parse_font_shorthand_family(&decl.value) {
|
||||||
// Heuristic: take everything after a `/` or after `px`/`em`/`rem`/`%` + space
|
Some(family) => family,
|
||||||
FONT_FAMILY
|
None => continue,
|
||||||
.captures(&format!("font-family: {}", &decl.value))
|
}
|
||||||
.map(|c| c[1].to_string())
|
|
||||||
.unwrap_or_else(|| decl.value.clone())
|
|
||||||
} else {
|
} else {
|
||||||
decl.value.clone()
|
decl.value.clone()
|
||||||
};
|
};
|
||||||
|
|
||||||
for font in split_font_families(&family_str) {
|
for font in split_font_families(&family_str) {
|
||||||
let lower = font.to_lowercase();
|
let lower = font.to_lowercase();
|
||||||
if !GENERIC_FONTS.contains(&lower.as_str()) && !is_junk_font_name(&lower) {
|
if !GENERIC_FONTS.contains(&lower.as_str())
|
||||||
|
&& !is_junk_font_name(&lower)
|
||||||
|
&& !is_third_party_auth_font(&lower, &brand)
|
||||||
|
{
|
||||||
*freq.entry(font).or_insert(0) += 1;
|
*freq.entry(font).or_insert(0) += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -619,6 +677,32 @@ fn extract_fonts(decls: &[CssDecl]) -> Vec<String> {
|
||||||
fonts.into_iter().map(|(name, _)| name).collect()
|
fonts.into_iter().map(|(name, _)| name).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns `true` when `name` is a "Google Sans" family found on a page whose
/// brand is not Google.
///
/// Both arguments are matched with plain substring checks, so callers are
/// expected to pass them already lowercased.
fn is_third_party_auth_font(name: &str, brand_name: &str) -> bool {
    if brand_name.contains("google") {
        // On Google's own properties the family is genuine brand typography.
        return false;
    }
    name.contains("google sans")
}
|
||||||
|
|
||||||
|
fn parse_font_shorthand_family(value: &str) -> Option<String> {
|
||||||
|
let caps = FONT_SHORTHAND_FAMILY.captures(value)?;
|
||||||
|
let mut family = caps.get(1)?.as_str().trim().to_string();
|
||||||
|
|
||||||
|
// Drop the optional slash line-height residue if it was not consumed due
|
||||||
|
// to unusual whitespace, then leave comma-separated family names intact.
|
||||||
|
if let Some(stripped) = family.strip_prefix('/') {
|
||||||
|
family = stripped
|
||||||
|
.split_once(' ')
|
||||||
|
.map(|(_, rest)| rest)
|
||||||
|
.unwrap_or("")
|
||||||
|
.trim()
|
||||||
|
.to_string();
|
||||||
|
}
|
||||||
|
|
||||||
|
if family.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(family)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Filter out junk font names: CSS variables, hex hashes (Next.js font optimization),
|
/// Filter out junk font names: CSS variables, hex hashes (Next.js font optimization),
|
||||||
/// single-character names, and other non-human-readable values.
|
/// single-character names, and other non-human-readable values.
|
||||||
fn is_junk_font_name(name: &str) -> bool {
|
fn is_junk_font_name(name: &str) -> bool {
|
||||||
|
|
@ -630,10 +714,43 @@ fn is_junk_font_name(name: &str) -> bool {
|
||||||
if name.len() >= 8 && name.chars().all(|c| c.is_ascii_hexdigit()) {
|
if name.len() >= 8 && name.chars().all(|c| c.is_ascii_hexdigit()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
if name
|
||||||
|
.split_whitespace()
|
||||||
|
.next()
|
||||||
|
.is_some_and(|part| part.len() >= 8 && part.chars().all(|c| c.is_ascii_hexdigit()))
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
// Too short to be a real font name
|
// Too short to be a real font name
|
||||||
if name.len() < 3 {
|
if name.len() < 3 {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
// Third-party rendering libraries and icon fonts overwhelm app shells
|
||||||
|
// like claude.com/openai.com but are not product typography.
|
||||||
|
if name.contains("katex")
|
||||||
|
|| name.contains("open dyslexic")
|
||||||
|
|| name.contains("opendyslexic")
|
||||||
|
|| name.contains("math")
|
||||||
|
|| name.contains("fraktur")
|
||||||
|
|| name.contains("caligraphic")
|
||||||
|
|| name.contains("typewriter")
|
||||||
|
|| name.contains("glyph")
|
||||||
|
|| name.contains("icon")
|
||||||
|
|| name.contains("emoji")
|
||||||
|
|| name.contains("symbol")
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// Malformed shorthand leftovers and CSS-internal values.
|
||||||
|
if name.contains(')')
|
||||||
|
|| name.contains('!')
|
||||||
|
|| name.contains('/')
|
||||||
|
|| name.contains("px ")
|
||||||
|
|| name.contains("rem ")
|
||||||
|
|| name.contains("em ")
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
// Starts with underscore or double dash (CSS internals)
|
// Starts with underscore or double dash (CSS internals)
|
||||||
if name.starts_with('_') || name.starts_with("--") {
|
if name.starts_with('_') || name.starts_with("--") {
|
||||||
return true;
|
return true;
|
||||||
|
|
@ -662,28 +779,11 @@ fn split_font_families(value: &str) -> Vec<String> {
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
fn find_logo(doc: &Html, base_url: Option<&Url>) -> Option<String> {
|
fn find_logo(doc: &Html, base_url: Option<&Url>) -> Option<String> {
|
||||||
// Strategy 1: <img> with class/id containing "logo"
|
if let Some(url) = find_logo_in_scope(doc, base_url, "header img, nav img") {
|
||||||
for el in doc.select(selector!("img")) {
|
return Some(url);
|
||||||
let class = el.value().attr("class").unwrap_or("");
|
|
||||||
let id = el.value().attr("id").unwrap_or("");
|
|
||||||
if (contains_ci(class, "logo") || contains_ci(id, "logo"))
|
|
||||||
&& let Some(src) = el.value().attr("src")
|
|
||||||
{
|
|
||||||
return Some(resolve_url(src, base_url));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Strategy 2: <img> with alt containing "logo"
|
// Strategy 2: <a href="/"> containing an <img> (homepage link with image)
|
||||||
for el in doc.select(selector!("img")) {
|
|
||||||
let alt = el.value().attr("alt").unwrap_or("");
|
|
||||||
if contains_ci(alt, "logo")
|
|
||||||
&& let Some(src) = el.value().attr("src")
|
|
||||||
{
|
|
||||||
return Some(resolve_url(src, base_url));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Strategy 3: <a href="/"> containing an <img> (homepage link with image)
|
|
||||||
for el in doc.select(selector!("a[href='/'] img, a[href] img")) {
|
for el in doc.select(selector!("a[href='/'] img, a[href] img")) {
|
||||||
// Check if parent <a> links to homepage
|
// Check if parent <a> links to homepage
|
||||||
if let Some(parent) = el.parent().and_then(|p| p.value().as_element()) {
|
if let Some(parent) = el.parent().and_then(|p| p.value().as_element()) {
|
||||||
|
|
@ -699,6 +799,20 @@ fn find_logo(doc: &Html, base_url: Option<&Url>) -> Option<String> {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn find_logo_in_scope(doc: &Html, base_url: Option<&Url>, selector_str: &str) -> Option<String> {
|
||||||
|
let selector = Selector::parse(selector_str).ok()?;
|
||||||
|
for el in doc.select(&selector) {
|
||||||
|
let class = el.value().attr("class").unwrap_or("");
|
||||||
|
let id = el.value().attr("id").unwrap_or("");
|
||||||
|
let alt = el.value().attr("alt").unwrap_or("");
|
||||||
|
let src = el.value().attr("src")?;
|
||||||
|
if contains_ci(class, "logo") || contains_ci(id, "logo") || contains_ci(alt, "logo") {
|
||||||
|
return Some(resolve_url(src, base_url));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Favicon detection
|
// Favicon detection
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
@ -829,8 +943,9 @@ fn find_all_logos(doc: &Html, base_url: Option<&Url>) -> Vec<LogoVariant> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Logo images (class/id/alt containing "logo")
|
// Logo images in header/nav first. Product/customer logo grids elsewhere
|
||||||
for el in doc.select(selector!("img")) {
|
// are common on SaaS sites and should not become the primary brand signal.
|
||||||
|
for el in doc.select(selector!("header img, nav img")) {
|
||||||
let class = el.value().attr("class").unwrap_or("");
|
let class = el.value().attr("class").unwrap_or("");
|
||||||
let id = el.value().attr("id").unwrap_or("");
|
let id = el.value().attr("id").unwrap_or("");
|
||||||
let alt = el.value().attr("alt").unwrap_or("");
|
let alt = el.value().attr("alt").unwrap_or("");
|
||||||
|
|
@ -997,6 +1112,25 @@ mod tests {
|
||||||
assert!(hexes.contains(&"#3498DB"), "brand color should survive");
|
assert!(hexes.contains(&"#3498DB"), "brand color should survive");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_google_oauth_palette_does_not_overwhelm_non_google_brand() {
    // Five Google sign-in palette colors next to two brand custom properties,
    // on a page whose site name is not Google.
    let html = r#"<html><head>
        <meta property="og:site_name" content="Claude">
        <style>
            .google-button { color: #1A73E8; background: #4285F4; border-color: #5F6368; }
            .google-icon { color: #202124; background: #E8EAED; }
            :root { --brand-accent: #D97757; --brand-text: #DC6038; }
        </style>
    </head><body></body></html>"#;

    let brand = extract_brand(html, None);
    let has_color = |hex: &str| brand.colors.iter().any(|c| c.hex.as_str() == hex);

    assert!(!has_color("#1A73E8"));
    assert!(!has_color("#4285F4"));
    assert!(has_color("#D97757"));
    assert!(has_color("#DC6038"));
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_font_extraction() {
|
fn test_font_extraction() {
|
||||||
let html = r#"<html><head><style>
|
let html = r#"<html><head><style>
|
||||||
|
|
@ -1040,6 +1174,24 @@ mod tests {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_font_shorthand_is_normalized_and_noise_filtered() {
    // Covers: `font:` shorthand with weight and size, a KaTeX family, a
    // hash-prefixed family name, and malformed emoji family names.
    let html = r#"<html><head><style>
        body { font: 500 12px "Roboto", Arial, sans-serif; }
        h1 { font: 1.21em/1.2 KaTeX_Main; }
        .hash { font-family: "9d9927955a95a20d s", "OpenAI Sans", sans-serif; }
        .bad { font-family: "Noto Color Emoji\")", "Segoe UI Emoji"; }
    </style></head><body></body></html>"#;

    let brand = extract_brand(html, None);
    let fonts = &brand.fonts;

    // Real families survive...
    assert!(fonts.contains(&"Roboto".to_string()));
    assert!(fonts.contains(&"OpenAI Sans".to_string()));

    // ...while shorthand residue and noise families do not.
    assert!(!fonts.iter().any(|f| f.contains("12px")));
    assert!(!fonts.iter().any(|f| f.contains("KaTeX")));
    assert!(!fonts.iter().any(|f| f.contains("Emoji")));
    assert!(!fonts.iter().any(|f| f.contains("9d9927955a95a20d")));
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_logo_by_class() {
|
fn test_logo_by_class() {
|
||||||
let html = r#"<html><body>
|
let html = r#"<html><body>
|
||||||
|
|
@ -1086,6 +1238,42 @@ mod tests {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_body_logo_grid_does_not_become_primary_brand_logo() {
    // Customer logos inside <main>, with no header or nav logo at all, must
    // not be reported as the site's own logo.
    let html = r#"<html><body>
        <main>
            <section class="customers">
                <img class="customer-logo" src="/logos/runway.svg" alt="Runway logo">
                <img class="customer-logo" src="/logos/zapier.svg" alt="Zapier logo">
            </section>
        </main>
    </body></html>"#;

    let brand = extract_brand(html, Some("https://example.com"));

    assert_eq!(brand.logo_url, None);
    assert!(brand.logos.is_empty());
}
|
||||||
|
|
||||||
|
#[test]
fn test_header_logo_is_still_primary_logo() {
    // One logo in <header> plus a customer logo in <main>: only the header
    // image may be reported.
    let html = r#"<html><body>
        <header>
            <img class="brand-logo" src="/logo.svg" alt="Acme logo">
        </header>
        <main>
            <img class="customer-logo" src="/logos/customer.svg" alt="Customer logo">
        </main>
    </body></html>"#;

    let expected = "https://example.com/logo.svg";
    let brand = extract_brand(html, Some("https://example.com"));

    assert_eq!(brand.logo_url.as_deref(), Some(expected));
    assert_eq!(brand.logos.len(), 1);
    assert_eq!(brand.logos[0].url, expected);
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_favicon() {
|
fn test_favicon() {
|
||||||
let html = r#"<html><head>
|
let html = r#"<html><head>
|
||||||
|
|
|
||||||
515
crates/webclaw-core/src/endpoints.rs
Normal file
515
crates/webclaw-core/src/endpoints.rs
Normal file
|
|
@ -0,0 +1,515 @@
|
||||||
|
//! API/endpoint surface discovery from HTML + JS bundle text.
|
||||||
|
//!
|
||||||
|
//! Pure and zero-network: callers fetch the page and its `<script src>`
|
||||||
|
//! bundles, then hand the raw text here. We surface API paths, absolute
|
||||||
|
//! API URLs, GraphQL and WebSocket endpoints that live in inline scripts
|
||||||
|
//! and bundles — the surface a sitemap/`map` can never see.
|
||||||
|
//!
|
||||||
|
//! Heuristic by design: regex over string literals, not JS dataflow.
|
||||||
|
//! High-signal patterns only; bounded for DoS safety.
|
||||||
|
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use regex::Regex;
|
||||||
|
use scraper::{Html, Selector};
|
||||||
|
use std::collections::BTreeSet;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
/// Hard cap on the bytes of script text scanned, so a hostile or huge bundle
/// set can't blow up CPU or memory.
const MAX_SCAN_BYTES: usize = 8 * 1024 * 1024;
/// Hard cap on the number of endpoints collected into one report.
const MAX_ENDPOINTS: usize = 2000;
/// Cap on `<script src>` URLs returned for the caller to fetch.
const MAX_SCRIPT_SRCS: usize = 40;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize)]
|
||||||
|
#[serde(rename_all = "snake_case")]
|
||||||
|
pub enum EndpointKind {
|
||||||
|
RelativePath,
|
||||||
|
AbsoluteUrl,
|
||||||
|
GraphQl,
|
||||||
|
WebSocket,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, serde::Serialize)]
|
||||||
|
pub struct DiscoveredEndpoint {
|
||||||
|
pub value: String,
|
||||||
|
pub kind: EndpointKind,
|
||||||
|
pub first_party: bool,
|
||||||
|
/// `"inline"` or the bundle URL the match came from.
|
||||||
|
pub source: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Default, serde::Serialize)]
|
||||||
|
pub struct EndpointReport {
|
||||||
|
pub endpoints: Vec<DiscoveredEndpoint>,
|
||||||
|
/// Distinct hosts seen across absolute URLs (first- and third-party).
|
||||||
|
pub hosts: Vec<String>,
|
||||||
|
pub bundles_scanned: usize,
|
||||||
|
/// True if a cap was hit and results may be incomplete.
|
||||||
|
pub truncated: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Quoted relative path that looks API-ish. Bounded quantifiers; the `regex`
|
||||||
|
// crate is linear-time (RE2) so this cannot catastrophically backtrack.
|
||||||
|
static RE_REL_PATH: Lazy<Regex> = Lazy::new(|| {
|
||||||
|
Regex::new(
|
||||||
|
r#"["'`](/[A-Za-z0-9_\-./]{0,200}?(?:api|graphql|gql|/v[0-9]|/rest|/gateway|/internal|/discovery)[A-Za-z0-9_\-./]{0,200})["'`]"#,
|
||||||
|
)
|
||||||
|
.expect("RE_REL_PATH")
|
||||||
|
});
|
||||||
|
|
||||||
|
static RE_ABS_URL: Lazy<Regex> = Lazy::new(|| {
|
||||||
|
Regex::new(r#"https?://[A-Za-z0-9.\-]{1,253}(?:/[A-Za-z0-9_\-./%]{0,400})?"#)
|
||||||
|
.expect("RE_ABS_URL")
|
||||||
|
});
|
||||||
|
|
||||||
|
static RE_WS: Lazy<Regex> = Lazy::new(|| {
|
||||||
|
Regex::new(r#"wss?://[A-Za-z0-9.\-]{1,253}(?:/[A-Za-z0-9_\-./%]{0,256})?"#).expect("RE_WS")
|
||||||
|
});
|
||||||
|
|
||||||
|
// Selector for every `<script>` element; parsed once on first use.
static SCRIPT_SEL: Lazy<Selector> = Lazy::new(|| Selector::parse("script").expect("script sel"));
|
||||||
|
|
||||||
|
/// Common multi-label public suffixes so `ticketmaster.co.uk` resolves to
/// `ticketmaster.co.uk` (not `co.uk`). Not a full PSL — pragmatic v1.
const SUFFIX2: &[&str] = &[
    "co.uk", "org.uk", "gov.uk", "ac.uk", "me.uk", "com.au", "net.au", "org.au", "co.jp", "co.nz",
    "co.za", "com.br", "com.mx", "com.sg", "co.in", "co.kr", "com.tr", "com.cn",
];

/// Reduces `host` to its registrable domain, lowercased and without a
/// trailing dot.
///
/// Hosts with fewer than two labels are returned as-is (after normalization).
/// Otherwise the last two labels are kept, or the last three when the last
/// two form one of the `SUFFIX2` entries and a third label exists.
fn registrable_domain(host: &str) -> String {
    let normalized = host.trim_end_matches('.').to_ascii_lowercase();
    let parts: Vec<&str> = normalized.split('.').collect();
    let count = parts.len();
    if count < 2 {
        return normalized;
    }

    let tail2 = parts[count - 2..].join(".");
    let needs_third_label = count >= 3 && SUFFIX2.contains(&tail2.as_str());
    if needs_third_label {
        return parts[count - 3..].join(".");
    }
    tail2
}
|
||||||
|
|
||||||
|
/// Returns `true` when `candidate_host` is `base_reg` itself or one of its
/// subdomains.
///
/// `candidate_host` is compared case-insensitively and with any trailing
/// root dot removed, so the FQDN form `api.example.com.` is treated like
/// `api.example.com`. `base_reg` is expected to be a lowercased registrable
/// domain. An empty `base_reg` (unparseable page URL) never matches;
/// previously any host ending in `.` was reported as first-party in that case.
fn is_first_party(candidate_host: &str, base_reg: &str) -> bool {
    if base_reg.is_empty() {
        return false;
    }
    let host = candidate_host.trim_end_matches('.').to_ascii_lowercase();
    match host.strip_suffix(base_reg) {
        // Exact match on the registrable domain.
        Some("") => true,
        // Subdomain: the remainder must end on a label boundary, so
        // `notexample.com` does not count as part of `example.com`.
        Some(prefix) => prefix.ends_with('.'),
        None => false,
    }
}
|
||||||
|
|
||||||
|
/// Registrable domains that are spec/schema/example noise rather than real
/// API surface (minified JSON-Schema and `schema.org` references show up
/// constantly in bundles).
const NOISE_HOSTS: &[&str] = &[
    "schema.org",
    "json-schema.org",
    "w3.org",
    "example.com",
    "example.org",
    "example.net",
    "localhost",
];
|
||||||
|
|
||||||
|
/// Decides whether `host` is worth reporting.
///
/// After dropping trailing dots, the host must have at least two labels, no
/// empty label, and a final label of two or more ASCII letters. That rejects
/// minifier garbage such as `http://f`, single-label names, and hosts whose
/// last label is not purely alphabetic (dotted IPv4 addresses included).
fn is_valid_host(host: &str) -> bool {
    let trimmed = host.trim_end_matches('.');

    let mut label_count = 0usize;
    let mut last_label = "";
    for label in trimmed.split('.') {
        if label.is_empty() {
            return false;
        }
        label_count += 1;
        last_label = label;
    }

    if label_count < 2 {
        return false;
    }
    last_label.len() >= 2 && last_label.chars().all(|c| c.is_ascii_alphabetic())
}
|
||||||
|
|
||||||
|
/// Flags relative paths that are only a bare prefix rather than an endpoint.
///
/// With trailing slashes removed, a path is noise when it is shorter than
/// four bytes (`/`, `/v1`) or exactly `/api` or `/rest`. `/graphql`, `/gql`
/// and `/api/x` are kept.
fn is_noise_path(p: &str) -> bool {
    let trimmed = p.trim_end_matches('/');
    if trimmed.len() < 4 {
        return true;
    }
    trimmed == "/api" || trimmed == "/rest"
}
|
||||||
|
|
||||||
|
/// Resolved absolute `<script src>` URLs (http/https only), deduped, capped.
|
||||||
|
/// Inline scripts have no `src` and are scanned via [`extract_endpoints`].
|
||||||
|
pub fn script_srcs(html: &str, base_url: &str) -> Vec<String> {
|
||||||
|
let base = Url::parse(base_url).ok();
|
||||||
|
let doc = Html::parse_document(html);
|
||||||
|
let mut seen = BTreeSet::new();
|
||||||
|
let mut out = Vec::new();
|
||||||
|
for el in doc.select(&SCRIPT_SEL) {
|
||||||
|
if out.len() >= MAX_SCRIPT_SRCS {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let Some(src) = el.value().attr("src") else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
let resolved = match Url::parse(src) {
|
||||||
|
Ok(u) => Some(u),
|
||||||
|
Err(_) => base.as_ref().and_then(|b| b.join(src).ok()),
|
||||||
|
};
|
||||||
|
let Some(u) = resolved else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
if (u.scheme() == "http" || u.scheme() == "https") && seen.insert(u.to_string()) {
|
||||||
|
out.push(u.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract endpoints from inline HTML scripts plus pre-fetched JS bundles.
|
||||||
|
/// `bundles` is `(bundle_url, bundle_text)`.
|
||||||
|
pub fn extract_endpoints(
|
||||||
|
html: &str,
|
||||||
|
base_url: &str,
|
||||||
|
bundles: &[(String, String)],
|
||||||
|
) -> EndpointReport {
|
||||||
|
let base_reg = Url::parse(base_url)
|
||||||
|
.ok()
|
||||||
|
.and_then(|u| u.host_str().map(registrable_domain))
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
let mut endpoints: Vec<DiscoveredEndpoint> = Vec::new();
|
||||||
|
let mut seen: BTreeSet<(String, String)> = BTreeSet::new();
|
||||||
|
let mut hosts: BTreeSet<String> = BTreeSet::new();
|
||||||
|
let mut budget = MAX_SCAN_BYTES;
|
||||||
|
let mut truncated = false;
|
||||||
|
|
||||||
|
let push = |value: String,
|
||||||
|
kind: EndpointKind,
|
||||||
|
source: &str,
|
||||||
|
endpoints: &mut Vec<DiscoveredEndpoint>,
|
||||||
|
seen: &mut BTreeSet<(String, String)>,
|
||||||
|
hosts: &mut BTreeSet<String>|
|
||||||
|
-> bool {
|
||||||
|
if endpoints.len() >= MAX_ENDPOINTS {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
let first_party = match Url::parse(&value) {
|
||||||
|
Ok(u) => {
|
||||||
|
let Some(h) = u.host_str() else {
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
if !is_valid_host(h) {
|
||||||
|
return true; // minifier garbage host
|
||||||
|
}
|
||||||
|
if NOISE_HOSTS.contains(®istrable_domain(h).as_str()) {
|
||||||
|
return true; // schema.org / json-schema.org / example.*
|
||||||
|
}
|
||||||
|
// Absolute URL with no real path is an origin/site link,
|
||||||
|
// not an API endpoint (drops the page's own URL too).
|
||||||
|
let path = u.path();
|
||||||
|
if path.is_empty() || path == "/" {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
hosts.insert(h.to_ascii_lowercase());
|
||||||
|
is_first_party(h, &base_reg)
|
||||||
|
}
|
||||||
|
// Relative path: same origin as the page by definition.
|
||||||
|
Err(_) => {
|
||||||
|
if is_noise_path(&value) {
|
||||||
|
return true; // bare /api, /, ultra-short
|
||||||
|
}
|
||||||
|
true
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if seen.insert((value.clone(), source.to_string())) {
|
||||||
|
endpoints.push(DiscoveredEndpoint {
|
||||||
|
value,
|
||||||
|
kind,
|
||||||
|
first_party,
|
||||||
|
source: source.to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
true
|
||||||
|
};
|
||||||
|
|
||||||
|
let scan = |text: &str,
|
||||||
|
source: &str,
|
||||||
|
endpoints: &mut Vec<DiscoveredEndpoint>,
|
||||||
|
seen: &mut BTreeSet<(String, String)>,
|
||||||
|
hosts: &mut BTreeSet<String>,
|
||||||
|
budget: &mut usize,
|
||||||
|
truncated: &mut bool| {
|
||||||
|
if *budget == 0 {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let slice = if text.len() > *budget {
|
||||||
|
*truncated = true;
|
||||||
|
&text[..*budget]
|
||||||
|
} else {
|
||||||
|
text
|
||||||
|
};
|
||||||
|
*budget -= slice.len();
|
||||||
|
|
||||||
|
for c in RE_REL_PATH.captures_iter(slice) {
|
||||||
|
if let Some(m) = c.get(1) {
|
||||||
|
let v = m.as_str().to_string();
|
||||||
|
let kind = if v.contains("graphql") || v.contains("/gql") {
|
||||||
|
EndpointKind::GraphQl
|
||||||
|
} else {
|
||||||
|
EndpointKind::RelativePath
|
||||||
|
};
|
||||||
|
if !push(v, kind, source, endpoints, seen, hosts) {
|
||||||
|
*truncated = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for m in RE_WS.find_iter(slice) {
|
||||||
|
if !push(
|
||||||
|
m.as_str().to_string(),
|
||||||
|
EndpointKind::WebSocket,
|
||||||
|
source,
|
||||||
|
endpoints,
|
||||||
|
seen,
|
||||||
|
hosts,
|
||||||
|
) {
|
||||||
|
*truncated = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for m in RE_ABS_URL.find_iter(slice) {
|
||||||
|
let v = m.as_str().to_string();
|
||||||
|
// Skip obvious static assets — we want API surface, not CDN files.
|
||||||
|
let lower = v.to_ascii_lowercase();
|
||||||
|
if lower.ends_with(".js")
|
||||||
|
|| lower.ends_with(".css")
|
||||||
|
|| lower.ends_with(".png")
|
||||||
|
|| lower.ends_with(".jpg")
|
||||||
|
|| lower.ends_with(".svg")
|
||||||
|
|| lower.ends_with(".woff2")
|
||||||
|
{
|
||||||
|
// still record the host for visibility
|
||||||
|
if let Some(h) = Url::parse(&v)
|
||||||
|
.ok()
|
||||||
|
.and_then(|u| u.host_str().map(str::to_string))
|
||||||
|
{
|
||||||
|
hosts.insert(h.to_ascii_lowercase());
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let kind = if lower.contains("graphql") || lower.contains("/gql") {
|
||||||
|
EndpointKind::GraphQl
|
||||||
|
} else {
|
||||||
|
EndpointKind::AbsoluteUrl
|
||||||
|
};
|
||||||
|
if !push(v, kind, source, endpoints, seen, hosts) {
|
||||||
|
*truncated = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Inline scripts.
|
||||||
|
let doc = Html::parse_document(html);
|
||||||
|
let mut inline = String::new();
|
||||||
|
for el in doc.select(&SCRIPT_SEL) {
|
||||||
|
if el.value().attr("src").is_none() {
|
||||||
|
inline.push_str(&el.text().collect::<String>());
|
||||||
|
inline.push('\n');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
scan(
|
||||||
|
&inline,
|
||||||
|
"inline",
|
||||||
|
&mut endpoints,
|
||||||
|
&mut seen,
|
||||||
|
&mut hosts,
|
||||||
|
&mut budget,
|
||||||
|
&mut truncated,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Bundles.
|
||||||
|
let mut bundles_scanned = 0usize;
|
||||||
|
for (src, text) in bundles {
|
||||||
|
if budget == 0 {
|
||||||
|
truncated = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
bundles_scanned += 1;
|
||||||
|
scan(
|
||||||
|
text,
|
||||||
|
src,
|
||||||
|
&mut endpoints,
|
||||||
|
&mut seen,
|
||||||
|
&mut hosts,
|
||||||
|
&mut budget,
|
||||||
|
&mut truncated,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
endpoints.sort_by(|a, b| (a.kind, &a.value, &a.source).cmp(&(b.kind, &b.value, &b.source)));
|
||||||
|
|
||||||
|
EndpointReport {
|
||||||
|
endpoints,
|
||||||
|
hosts: hosts.into_iter().collect(),
|
||||||
|
bundles_scanned,
|
||||||
|
truncated,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Multi-label suffixes such as `.co.uk` must survive the reduction to a
    /// registrable domain; single-label hosts pass through unchanged.
    #[test]
    fn registrable_domain_handles_cc_tlds() {
        let cases = [
            ("www.ticketmaster.co.uk", "ticketmaster.co.uk"),
            ("api.ticketmaster.com", "ticketmaster.com"),
            ("pubapi.ticketmaster.co.uk", "ticketmaster.co.uk"),
            ("localhost", "localhost"),
        ];
        for (host, expected) in cases {
            assert_eq!(registrable_domain(host), expected);
        }
    }

    /// Relative `src` values resolve against the page URL; inline scripts and
    /// non-http(s) schemes are left out.
    #[test]
    fn script_srcs_resolves_and_filters() {
        let page = r#"<html><head>
<script src="/_next/static/chunks/main-abc.js"></script>
<script src="https://cdn.example.net/lib.js"></script>
<script>var inline = 1;</script>
<script src="data:text/javascript,1"></script>
</head></html>"#;
        let found = script_srcs(page, "https://www.ticketmaster.co.uk/");

        for wanted in [
            "https://www.ticketmaster.co.uk/_next/static/chunks/main-abc.js",
            "https://cdn.example.net/lib.js",
        ] {
            assert!(found.iter().any(|s| s == wanted));
        }
        assert_eq!(found.len(), 2, "inline + data: ignored");
    }

    /// End-to-end: inline scripts and one bundle, checking which values are
    /// reported and how they are classified.
    #[test]
    fn extracts_inline_and_bundle_endpoints_with_classification() {
        let page = r#"<html><body>
<script>
var cfg = { search: "/api/search/events", suggest: "/api/search/search-suggest" };
fetch("/api/venue/info");
</script>
<script src="/app.js"></script>
</body></html>"#;
        let bundle_text = r#"
const GQL = "https://pubapi.ticketmaster.co.uk/graphql";
axios.post("https://services.ticketmaster.co.uk/discovery/v2/events");
new WebSocket("wss://live.ticketmaster.co.uk/socket");
const ga = "https://www.googletagservices.com/tag/js/gpt.js";
const img = "https://cdn.tmol.co/hero.png";
"#
        .to_string();
        let bundles = vec![(
            "https://www.ticketmaster.co.uk/app.js".to_string(),
            bundle_text,
        )];

        let report = extract_endpoints(page, "https://www.ticketmaster.co.uk/", &bundles);
        let reported = |needle: &str| report.endpoints.iter().any(|e| e.value == needle);

        for expected in [
            "/api/search/events",
            "/api/search/search-suggest",
            "/api/venue/info",
            "https://pubapi.ticketmaster.co.uk/graphql",
            "https://services.ticketmaster.co.uk/discovery/v2/events",
            "wss://live.ticketmaster.co.uk/socket",
        ] {
            assert!(reported(expected));
        }
        // static .js asset is not an endpoint, but its host is recorded
        assert!(!reported("https://www.googletagservices.com/tag/js/gpt.js"));
        assert!(report.hosts.iter().any(|h| h == "www.googletagservices.com"));

        let graphql = report
            .endpoints
            .iter()
            .find(|e| e.value.contains("graphql"))
            .unwrap();
        assert_eq!(graphql.kind, EndpointKind::GraphQl);
        assert!(
            graphql.first_party,
            "pubapi.ticketmaster.co.uk is first-party to .co.uk"
        );

        let venue = report
            .endpoints
            .iter()
            .find(|e| e.value.starts_with("/api/venue"))
            .unwrap();
        assert!(venue.first_party, "relative path is same-origin");
        assert_eq!(report.bundles_scanned, 1);
    }

    /// An absolute URL on an unrelated registrable domain is third-party.
    #[test]
    fn third_party_absolute_is_flagged_not_first_party() {
        let bundles = vec![(
            "b".to_string(),
            r#"x="https://api.stripe.com/v1/charges""#.to_string(),
        )];
        let report = extract_endpoints("<html></html>", "https://www.ticketmaster.co.uk/", &bundles);

        let stripe = report
            .endpoints
            .iter()
            .find(|e| e.value.contains("stripe"))
            .unwrap();
        assert!(!stripe.first_party);
    }

    #[test]
    fn caps_bound_pathological_input() {
        // A huge blob of fake endpoints must not exceed MAX_ENDPOINTS and
        // must return promptly (regex crate is linear-time).
        let big: String = (0..50_000)
            .map(|i| format!("\"/api/v1/item/{i}\" "))
            .collect();
        let bundles = vec![("big".to_string(), big)];

        let report = extract_endpoints("<html></html>", "https://x.com/", &bundles);

        assert!(report.endpoints.len() <= MAX_ENDPOINTS);
        assert!(report.truncated);
    }

    /// Empty HTML, an unparseable base URL and no bundles yield an empty,
    /// untruncated report.
    #[test]
    fn empty_inputs_are_safe() {
        let report = extract_endpoints("", "not a url", &[]);

        assert!(report.endpoints.is_empty());
        assert_eq!(report.bundles_scanned, 0);
        assert!(!report.truncated);
    }

    /// Noise filtering: bare prefixes, junk hosts, schema hosts and the page's
    /// own origin are dropped while real endpoints are kept.
    #[test]
    fn v1_1_noise_is_filtered() {
        let bundles = vec![(
            "b.js".to_string(),
            r#"
"/api/search/events";
"/api"; "/api/";
"http://f"; "http://n/x";
"https://schema.org/Thing";
"http://json-schema.org/draft-07/schema";
"https://www.ticketmaster.co.uk/";
"https://pubapi.ticketmaster.co.uk/discovery/v2/events";
"wss://live.ticketmaster.co.uk/socket";
"#
            .to_string(),
        )];
        let report = extract_endpoints("<html></html>", "https://www.ticketmaster.co.uk/", &bundles);
        let vals: std::collections::BTreeSet<&str> =
            report.endpoints.iter().map(|e| e.value.as_str()).collect();

        for kept in [
            "/api/search/events",
            "https://pubapi.ticketmaster.co.uk/discovery/v2/events",
            "wss://live.ticketmaster.co.uk/socket",
        ] {
            assert!(vals.contains(kept));
        }
        for junk in [
            "/api",
            "/api/",
            "http://f",
            "http://n/x",
            "https://schema.org/Thing",
            "http://json-schema.org/draft-07/schema",
            "https://www.ticketmaster.co.uk/",
        ] {
            assert!(!vals.contains(junk), "noise leaked: {junk}");
        }

        for banned in ["f", "n", "schema.org"] {
            assert!(!report.hosts.iter().any(|h| h == banned));
        }
        assert!(report.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk"));
    }
}
|
||||||
|
|
@ -9,10 +9,12 @@ use once_cell::sync::Lazy;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use rquickjs::{Context, Runtime};
|
use rquickjs::{Context, Runtime};
|
||||||
use scraper::{Html, Selector};
|
use scraper::{Html, Selector};
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").unwrap());
|
static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").unwrap());
|
||||||
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
|
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
|
||||||
|
const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);
|
||||||
|
|
||||||
/// A blob of data extracted from JS execution.
|
/// A blob of data extracted from JS execution.
|
||||||
pub struct JsDataBlob {
|
pub struct JsDataBlob {
|
||||||
|
|
@ -49,6 +51,8 @@ pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
|
||||||
let rt = Runtime::new().expect("QuickJS runtime creation failed");
|
let rt = Runtime::new().expect("QuickJS runtime creation failed");
|
||||||
rt.set_memory_limit(64 * 1024 * 1024); // 64 MB
|
rt.set_memory_limit(64 * 1024 * 1024); // 64 MB
|
||||||
rt.set_max_stack_size(1024 * 1024); // 1 MB
|
rt.set_max_stack_size(1024 * 1024); // 1 MB
|
||||||
|
let deadline = Instant::now() + JS_EVAL_TIMEOUT;
|
||||||
|
rt.set_interrupt_handler(Some(Box::new(move || Instant::now() >= deadline)));
|
||||||
|
|
||||||
let ctx = Context::full(&rt).expect("QuickJS context creation failed");
|
let ctx = Context::full(&rt).expect("QuickJS context creation failed");
|
||||||
|
|
||||||
|
|
@ -464,6 +468,8 @@ fn walk_rsc_tree(value: &serde_json::Value, out: &mut Vec<String>, depth: usize)
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
@ -493,6 +499,29 @@ mod tests {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn js_eval_interrupts_infinite_loops() {
    // A script that never terminates on its own: evaluation has to be cut
    // short for `extract_js_data` to return at all.
    let html = r#"
<html>
<head>
<script>
while (true) {}
</script>
</head>
<body>hello</body>
</html>
"#;

    let started = Instant::now();
    let blobs = extract_js_data(html);

    assert!(blobs.is_empty());
    let took = started.elapsed();
    assert!(
        took < Duration::from_secs(2),
        "QuickJS execution should be interrupted quickly"
    );
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn skips_external_and_module_scripts() {
|
fn skips_external_and_module_scripts() {
|
||||||
let html = r#"<html><body>
|
let html = r#"<html><body>
|
||||||
|
|
|
||||||
|
|
@ -7,15 +7,17 @@ pub(crate) mod data_island;
|
||||||
/// Zero network dependencies — WASM-compatible by design.
|
/// Zero network dependencies — WASM-compatible by design.
|
||||||
pub mod diff;
|
pub mod diff;
|
||||||
pub mod domain;
|
pub mod domain;
|
||||||
|
pub mod endpoints;
|
||||||
pub mod error;
|
pub mod error;
|
||||||
pub mod extractor;
|
pub mod extractor;
|
||||||
#[cfg(feature = "quickjs")]
|
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||||
pub mod js_eval;
|
pub mod js_eval;
|
||||||
pub mod llm;
|
pub mod llm;
|
||||||
pub mod markdown;
|
pub mod markdown;
|
||||||
pub mod metadata;
|
pub mod metadata;
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
pub(crate) mod noise;
|
pub(crate) mod noise;
|
||||||
|
pub mod reddit;
|
||||||
pub mod structured_data;
|
pub mod structured_data;
|
||||||
pub mod types;
|
pub mod types;
|
||||||
pub mod youtube;
|
pub mod youtube;
|
||||||
|
|
@ -46,9 +48,13 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
|
||||||
/// `url` — optional source URL, used for resolving relative links and domain detection
|
/// `url` — optional source URL, used for resolving relative links and domain detection
|
||||||
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
|
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
|
||||||
///
|
///
|
||||||
/// Spawns extraction on a thread with an 8 MB stack to handle deeply nested
|
/// On native targets, spawns extraction on a thread with an 8 MB stack to
|
||||||
/// HTML (e.g., Express.co.uk live blogs) without overflowing the default 1-2 MB
|
/// handle deeply nested HTML (e.g., Express.co.uk live blogs) without
|
||||||
/// main-thread stack on Windows.
|
/// overflowing the default 1-2 MB main-thread stack on Windows.
|
||||||
|
///
|
||||||
|
/// On `wasm32`, threads are unavailable (`std::thread::spawn` panics at
|
||||||
|
/// runtime), so extraction runs inline on the caller's stack.
|
||||||
|
#[cfg(not(target_arch = "wasm32"))]
|
||||||
pub fn extract_with_options(
|
pub fn extract_with_options(
|
||||||
html: &str,
|
html: &str,
|
||||||
url: Option<&str>,
|
url: Option<&str>,
|
||||||
|
|
@ -70,6 +76,16 @@ pub fn extract_with_options(
|
||||||
.unwrap_or(Err(ExtractError::NoContent))
|
.unwrap_or(Err(ExtractError::NoContent))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// `wasm32` build of [`extract_with_options`].
///
/// This variant does not spawn a thread: it forwards its arguments straight
/// to `extract_with_options_inner` and returns that result unchanged, so the
/// extraction runs on the caller's stack.
#[cfg(target_arch = "wasm32")]
pub fn extract_with_options(
    html: &str,
    url: Option<&str>,
    options: &ExtractionOptions,
) -> Result<ExtractionResult, ExtractError> {
    extract_with_options_inner(html, url, options)
}
|
||||||
|
|
||||||
fn extract_with_options_inner(
|
fn extract_with_options_inner(
|
||||||
html: &str,
|
html: &str,
|
||||||
url: Option<&str>,
|
url: Option<&str>,
|
||||||
|
|
@ -79,6 +95,24 @@ fn extract_with_options_inner(
|
||||||
return Err(ExtractError::NoContent);
|
return Err(ExtractError::NoContent);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reddit fast path: parse old.reddit.com HTML directly.
|
||||||
|
// The fetch layer rewrites all Reddit hosts to old.reddit.com before
|
||||||
|
// calling extract, so we always get stable server-rendered HTML here.
|
||||||
|
if let Some(u) = url
|
||||||
|
&& reddit::is_reddit_url(u)
|
||||||
|
{
|
||||||
|
if let Some(result) = reddit::try_extract(html, u) {
|
||||||
|
return Ok(result);
|
||||||
|
}
|
||||||
|
// A recognised comment thread that we couldn't parse (Reddit markup
|
||||||
|
// change, or a block/challenge page) — don't fall through to generic
|
||||||
|
// extraction, which would emit Reddit nav/sidebar chrome. Listings
|
||||||
|
// and profiles (no `/comments/`) intentionally fall through below.
|
||||||
|
if u.contains("/comments/") {
|
||||||
|
return Err(ExtractError::NoContent);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// YouTube fast path: if the URL is a YouTube video page, try extracting
|
// YouTube fast path: if the URL is a YouTube video page, try extracting
|
||||||
// structured metadata from ytInitialPlayerResponse before DOM scoring.
|
// structured metadata from ytInitialPlayerResponse before DOM scoring.
|
||||||
// This gives LLMs a clean, structured view of video metadata.
|
// This gives LLMs a clean, structured view of video metadata.
|
||||||
|
|
@ -187,7 +221,7 @@ fn extract_with_options_inner(
|
||||||
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
|
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
|
||||||
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
|
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
|
||||||
// static JSON data island extraction above with runtime-evaluated data.
|
// static JSON data island extraction above with runtime-evaluated data.
|
||||||
#[cfg(feature = "quickjs")]
|
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||||
{
|
{
|
||||||
let blobs = js_eval::extract_js_data(html);
|
let blobs = js_eval::extract_js_data(html);
|
||||||
if !blobs.is_empty() {
|
if !blobs.is_empty() {
|
||||||
|
|
@ -603,4 +637,36 @@ mod tests {
|
||||||
"Should extract content from deep nesting"
|
"Should extract content from deep nesting"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn wasm_direct_call_path_extracts_content() {
    // On wasm32 the public entry point calls `extract_with_options_inner`
    // directly instead of going through a spawned thread. Calling the inner
    // function here keeps that code path covered on native CI and checks it
    // agrees with the public entry point.
    let html = r#"
<html lang="en">
<head><title>WASM Path</title></head>
<body><article><h1>Heading</h1><p>WASM-safe extraction body content.</p></article></body>
</html>"#;
    let page_url = Some("https://example.com");
    let opts = ExtractionOptions::default();

    let inner = extract_with_options_inner(html, page_url, &opts)
        .expect("inner extraction (wasm path) should succeed");
    let inner_markdown = &inner.content.markdown;
    assert!(
        inner_markdown.contains("WASM-safe extraction body content"),
        "wasm direct-call path should extract body, got: {}",
        inner_markdown
    );

    let threaded = extract_with_options(html, page_url, &opts)
        .expect("threaded extraction should succeed");
    assert_eq!(
        inner.content.markdown, threaded.content.markdown,
        "wasm path and threaded path must produce identical content"
    );
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,9 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody {
|
||||||
// 0c. Strip leaked JavaScript (framework hydration, self.__wrap_n, etc.)
|
// 0c. Strip leaked JavaScript (framework hydration, self.__wrap_n, etc.)
|
||||||
let text = cleanup::strip_leaked_js(&text);
|
let text = cleanup::strip_leaked_js(&text);
|
||||||
|
|
||||||
|
// 0c2. Strip a11y link chrome ("opens new tab", external link hints)
|
||||||
|
let text = cleanup::strip_a11y_link_chrome(&text);
|
||||||
|
|
||||||
// 0d. Collapse spaced-out text (CSS animation artifacts like "S t a r t")
|
// 0d. Collapse spaced-out text (CSS animation artifacts like "S t a r t")
|
||||||
// Must run before any dedup -- spaced text confuses word-based dedup.
|
// Must run before any dedup -- spaced text confuses word-based dedup.
|
||||||
let text = cleanup::collapse_spaced_text(&text);
|
let text = cleanup::collapse_spaced_text(&text);
|
||||||
|
|
@ -70,7 +73,15 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody {
|
||||||
// d. Extract links, replace inline `[text](url)` with just `text`
|
// d. Extract links, replace inline `[text](url)` with just `text`
|
||||||
let (text, extracted_links) = links::extract_and_strip_links(&text);
|
let (text, extracted_links) = links::extract_and_strip_links(&text);
|
||||||
|
|
||||||
// d2. Collapse repeated adjacent phrases on the same line
|
// d1. Strip bare-integer paragraphs after link extraction, so
|
||||||
|
// `[0](#comments)` collapses to `0` before the paragraph-aware check.
|
||||||
|
let text = cleanup::strip_bare_number_lines(&text);
|
||||||
|
|
||||||
|
// d2. Run UI-control stripping again after link extraction. Lines like
|
||||||
|
// `[0](url) Next` become `0 Next`, which is pure pagination chrome.
|
||||||
|
let text = cleanup::strip_ui_control_text(&text);
|
||||||
|
|
||||||
|
// d3. Collapse repeated adjacent phrases on the same line
|
||||||
// (responsive variants: "Read more Read more Read more" -> "Read more")
|
// (responsive variants: "Read more Read more Read more" -> "Read more")
|
||||||
let text = dedup_repeated_phrases(&text);
|
let text = dedup_repeated_phrases(&text);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -146,6 +146,45 @@ pub(crate) fn strip_leaked_js(input: &str) -> String {
|
||||||
out
|
out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Accessibility link chrome ("opens new tab", "external link")
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Strip screen-reader-only link chrome that bleeds into rendered text.
|
||||||
|
///
|
||||||
|
/// Sites like Reuters wrap external/new-window links with hidden spans
|
||||||
|
/// like `<span class="visually-hidden">, opens new tab</span>`. The noise
|
||||||
|
/// filter can't reliably catch these (no consistent class hook across
|
||||||
|
/// sites), so they end up duplicated all over the body text. This is a
|
||||||
|
/// targeted text-level scrub of the most common phrasings.
|
||||||
|
pub(crate) fn strip_a11y_link_chrome(input: &str) -> String {
|
||||||
|
static A11Y_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
||||||
|
Regex::new(
|
||||||
|
r"(?i)(?:\s*,\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?|\s+\((?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\)\.?|\s+external link\b\.?$)",
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut out = String::with_capacity(input.len());
|
||||||
|
let mut in_code_fence = false;
|
||||||
|
for (i, line) in input.lines().enumerate() {
|
||||||
|
if i > 0 {
|
||||||
|
out.push('\n');
|
||||||
|
}
|
||||||
|
if line.trim().starts_with("```") {
|
||||||
|
in_code_fence = !in_code_fence;
|
||||||
|
out.push_str(line);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if in_code_fence {
|
||||||
|
out.push_str(line);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
out.push_str(&A11Y_PATTERN.replace_all(line, ""));
|
||||||
|
}
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Spaced-out text collapsing (CSS animation artifacts)
|
// Spaced-out text collapsing (CSS animation artifacts)
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
@ -346,16 +385,33 @@ pub(crate) fn is_ui_control_line(line: &str) -> bool {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Split by whitespace: every token must be a known UI control
|
// Split by whitespace: every token must be a known UI control, with short
|
||||||
|
// numbers allowed only when paired with real pagination chrome.
|
||||||
let tokens: Vec<&str> = trimmed.split_whitespace().collect();
|
let tokens: Vec<&str> = trimmed.split_whitespace().collect();
|
||||||
if tokens.is_empty() {
|
if tokens.is_empty() {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
tokens.iter().all(|t| is_ui_control_token(t))
|
|
||||||
|
let mut has_named_control = false;
|
||||||
|
for token in tokens {
|
||||||
|
if is_bare_short_integer(token) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if is_ui_control_token(token) {
|
||||||
|
has_named_control = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
has_named_control
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Known UI control tokens from Material Icons ligatures, icon fonts, and
|
/// Known UI control tokens from Material Icons ligatures, icon fonts, and
|
||||||
/// common navigation elements that leak into text extraction.
|
/// common navigation elements that leak into text extraction.
|
||||||
|
///
|
||||||
|
/// Match is case-insensitive: `Next`, `next`, and `NEXT` are all treated as
|
||||||
|
/// pagination chrome when alone on a line.
|
||||||
fn is_ui_control_token(token: &str) -> bool {
|
fn is_ui_control_token(token: &str) -> bool {
|
||||||
const UI_CONTROLS: &[&str] = &[
|
const UI_CONTROLS: &[&str] = &[
|
||||||
// Material Icons ligatures
|
// Material Icons ligatures
|
||||||
|
|
@ -389,6 +445,12 @@ fn is_ui_control_token(token: &str) -> bool {
|
||||||
"search",
|
"search",
|
||||||
"menu",
|
"menu",
|
||||||
"share",
|
"share",
|
||||||
|
// Pagination chrome left over from rendered "Next | Previous" links.
|
||||||
|
"next",
|
||||||
|
"previous",
|
||||||
|
"prev",
|
||||||
|
"older",
|
||||||
|
"newer",
|
||||||
// Arrow/nav characters
|
// Arrow/nav characters
|
||||||
"\u{2190}",
|
"\u{2190}",
|
||||||
"\u{2192}",
|
"\u{2192}",
|
||||||
|
|
@ -405,7 +467,56 @@ fn is_ui_control_token(token: &str) -> bool {
|
||||||
"\u{00BB}",
|
"\u{00BB}",
|
||||||
"\u{00AB}",
|
"\u{00AB}",
|
||||||
];
|
];
|
||||||
UI_CONTROLS.contains(&token)
|
let lowered = token.to_ascii_lowercase();
|
||||||
|
UI_CONTROLS.contains(&lowered.as_str())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Remove lines that are a bare short integer alone in their paragraph.
|
||||||
|
///
|
||||||
|
/// News index pages often render comment counts (`0`, `42`) and pagination
|
||||||
|
/// page numbers (`1`, `2`) as standalone paragraphs after each article. These
|
||||||
|
/// add zero signal and confuse downstream readers, but they are real numbers
|
||||||
|
/// not control tokens, so [`strip_ui_control_text`] does not catch them.
|
||||||
|
///
|
||||||
|
/// To stay safe, we only drop a line if both conditions hold:
|
||||||
|
/// 1. The trimmed line is a non-negative integer <= 9999.
|
||||||
|
/// 2. The line is alone in its paragraph, surrounded by blank lines or edges.
|
||||||
|
pub(crate) fn strip_bare_number_lines(input: &str) -> String {
|
||||||
|
let lines: Vec<&str> = input.lines().collect();
|
||||||
|
let mut out: Vec<&str> = Vec::with_capacity(lines.len());
|
||||||
|
let mut in_code = false;
|
||||||
|
|
||||||
|
for (i, line) in lines.iter().enumerate() {
|
||||||
|
let trimmed = line.trim();
|
||||||
|
if trimmed.starts_with("```") {
|
||||||
|
in_code = !in_code;
|
||||||
|
out.push(line);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if in_code {
|
||||||
|
out.push(line);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if is_bare_short_integer(trimmed) && is_isolated_in_paragraph(&lines, i) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
out.push(line);
|
||||||
|
}
|
||||||
|
|
||||||
|
out.join("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `true` when `s` is one to four bytes long and every byte is an ASCII digit.
fn is_bare_short_integer(s: &str) -> bool {
    // Non-ASCII characters encode to bytes >= 0x80, none of which is a digit,
    // so checking bytes is equivalent to checking chars here.
    (1..=4).contains(&s.len()) && s.bytes().all(|b| b.is_ascii_digit())
}
|
||||||
|
|
||||||
|
/// `true` when line `i` has nothing but blank lines (or the edge of `lines`)
/// directly above and directly below it.
fn is_isolated_in_paragraph(lines: &[&str], i: usize) -> bool {
    let blank_at = |idx: usize| lines[idx].trim().is_empty();

    // The first line has no neighbour above; the last has none below.
    let above_clear = match i {
        0 => true,
        _ => blank_at(i - 1),
    };
    let below_clear = if i + 1 == lines.len() {
        true
    } else {
        blank_at(i + 1)
    };

    above_clear && below_clear
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
@ -1119,6 +1230,37 @@ mod tests {
|
||||||
assert_eq!(strip_ui_control_text(input), "Hello\nWorld");
|
assert_eq!(strip_ui_control_text(input), "Hello\nWorld");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn ui_control_strips_pagination_with_comment_count() {
    // A short number next to a pagination word is still pure chrome.
    for chrome in ["0 Next", "12 PREVIOUS"] {
        assert!(is_ui_control_line(chrome));
    }

    let stripped = strip_ui_control_text("Story\n0 Next\nMore");
    assert_eq!(stripped, "Story\nMore");
}
|
||||||
|
|
||||||
|
#[test]
fn ui_control_keeps_bare_numbers_for_context() {
    // A number with no named control beside it is content, not chrome.
    assert!(!is_ui_control_line("2026"));

    let text = "Revenue\n2026\nReport";
    assert_eq!(strip_ui_control_text(text), text);
}
|
||||||
|
|
||||||
|
#[test]
fn bare_number_lines_strip_isolated_counts() {
    // The lone "0" paragraph disappears; both blank lines around it stay.
    let cleaned = strip_bare_number_lines("Article title\n\n0\n\nNext article");
    assert_eq!(cleaned, "Article title\n\n\nNext article");
}
|
||||||
|
|
||||||
|
#[test]
fn bare_number_lines_keep_lists_and_code() {
    // List items ("- 1", "1.") and digits inside a code fence must survive.
    let untouched = "- 1\n\n1.\n\n```\n0\n```\n\nReal text";
    let result = strip_bare_number_lines(untouched);
    assert_eq!(result, untouched);
}
|
||||||
|
|
||||||
// -- Long alt-text descriptions --
|
// -- Long alt-text descriptions --
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
@ -1356,4 +1498,48 @@ mod tests {
|
||||||
let input = "```\nImage of something in code\n```";
|
let input = "```\nImage of something in code\n```";
|
||||||
assert_eq!(strip_alt_text_noise(input), input);
|
assert_eq!(strip_alt_text_noise(input), input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn a11y_strips_opens_new_tab() {
    let out =
        strip_a11y_link_chrome("Download the App, opens new tab and Subscribe, opens new tab.");

    // The annotation is gone, whatever its casing...
    let lowered = out.to_lowercase();
    assert!(!lowered.contains("opens new tab"), "leak: {out}");

    // ...while both link labels are still present.
    for label in ["Download the App", "Subscribe"] {
        assert!(out.contains(label));
    }
}
|
||||||
|
|
||||||
|
#[test]
fn a11y_strips_external_link_variants() {
    // (raw text, prefix that must remain at the start of the cleaned text)
    const CASES: [(&str, &str); 3] = [
        ("Visit our docs, opens external link", "Visit our docs"),
        ("Click here, opens in a new window.", "Click here"),
        ("More info external link", "More info"),
    ];

    for (input, expected_prefix) in CASES {
        let out = strip_a11y_link_chrome(input);

        let kept_prefix = out.starts_with(expected_prefix);
        assert!(kept_prefix, "input={input:?} got={out:?}");

        let leaked = out.to_lowercase().contains("opens");
        assert!(!leaked, "leak: {out}");
    }
}
|
||||||
|
|
||||||
|
#[test]
fn a11y_preserves_code_blocks() {
    let out = strip_a11y_link_chrome(
        "```\nopens new tab is a function\n```\nDownload, opens new tab",
    );

    // Text inside the fence is left verbatim.
    let code_intact = out.contains("opens new tab is a function");
    assert!(code_intact, "code stripped: {out}");

    // Outside the fence, the chrome is removed.
    let chrome_left = out.to_lowercase().contains("download, opens new tab");
    assert!(!chrome_left);
}
|
||||||
|
|
||||||
|
#[test]
fn a11y_preserves_external_link_prose() {
    // "external link" used mid-sentence as ordinary prose must not be stripped.
    let prose = "Researchers found an external link between the two incidents.";
    let out = strip_a11y_link_chrome(prose);
    assert_eq!(out, prose);
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -69,6 +69,18 @@ fn is_noise_link(text: &str, href: &str) -> bool {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Bare integer labels are usually comment counts, vote counts, or page
|
||||||
|
// numbers. The label alone carries no useful link context for an LLM.
|
||||||
|
if !text.is_empty() && text.len() <= 4 && text.chars().all(|c| c.is_ascii_digit()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// In-page comment/discussion fragments that survived the bare-fragment
|
||||||
|
// check because the href is a full URL with a comment fragment.
|
||||||
|
if href.contains("#comment-stream") || href.contains("#comments") || href.contains("#disqus") {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
// Internal user profile / action URLs (HN-style)
|
// Internal user profile / action URLs (HN-style)
|
||||||
if href.contains("/user?id=")
|
if href.contains("/user?id=")
|
||||||
|| href.contains("/hide?id=")
|
|| href.contains("/hide?id=")
|
||||||
|
|
@ -88,10 +100,19 @@ fn is_noise_link(text: &str, href: &str) -> bool {
|
||||||
static MD_MARKERS_RE: Lazy<Regex> =
|
static MD_MARKERS_RE: Lazy<Regex> =
|
||||||
Lazy::new(|| Regex::new(r"#{1,6}\s+|\*{1,2}|_{1,2}|`").unwrap());
|
Lazy::new(|| Regex::new(r"#{1,6}\s+|\*{1,2}|_{1,2}|`").unwrap());
|
||||||
|
|
||||||
|
static A11Y_LABEL_RE: Lazy<Regex> = Lazy::new(|| {
|
||||||
|
Regex::new(
|
||||||
|
r"(?i)(?:\s*,?\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website))\b\.?|\s*,\s*external link\b\.?|\s+external link\b\.?$)",
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
/// Clean a link label: strip markdown, dedup repeated phrases, truncate.
|
/// Clean a link label: strip markdown, dedup repeated phrases, truncate.
|
||||||
pub(crate) fn clean_link_label(raw: &str) -> String {
|
pub(crate) fn clean_link_label(raw: &str) -> String {
|
||||||
// Strip markdown markers
|
// Strip markdown markers
|
||||||
let label = MD_MARKERS_RE.replace_all(raw, "").to_string();
|
let label = MD_MARKERS_RE.replace_all(raw, "").to_string();
|
||||||
|
// Strip a11y link chrome ("opens new tab", etc.)
|
||||||
|
let label = A11Y_LABEL_RE.replace_all(&label, "").to_string();
|
||||||
let label = label.split_whitespace().collect::<Vec<_>>().join(" ");
|
let label = label.split_whitespace().collect::<Vec<_>>().join(" ");
|
||||||
|
|
||||||
// Dedup repeated phrases in label
|
// Dedup repeated phrases in label
|
||||||
|
|
@ -181,4 +202,20 @@ mod tests {
|
||||||
assert!(is_noise_link("user", "https://hn.com/user?id=foo"));
|
assert!(is_noise_link("user", "https://hn.com/user?id=foo"));
|
||||||
assert!(!is_noise_link("Rust docs", "https://rust-lang.org"));
|
assert!(!is_noise_link("Rust docs", "https://rust-lang.org"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn link_label_preserves_external_link_prose() {
    // "external link" in the middle of a label is prose, not a11y chrome.
    let label = "Research found an external link between incidents";
    assert_eq!(clean_link_label(label), label);
}
|
||||||
|
|
||||||
|
#[test]
fn link_label_strips_terminal_external_link_chrome() {
    // A trailing "external link" is an accessibility suffix and is removed.
    let cleaned = clean_link_label("Reuters story external link");
    assert_eq!(cleaned, "Reuters story");
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -46,15 +46,119 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
|
||||||
}
|
}
|
||||||
|
|
||||||
// -- 4. Structured data (NEXT_DATA, SvelteKit, JSON-LD) --
|
// -- 4. Structured data (NEXT_DATA, SvelteKit, JSON-LD) --
|
||||||
if !result.structured_data.is_empty() {
|
// Only emit useful items: Schema.org records with a meaningful @type,
|
||||||
out.push_str("\n\n## Structured Data\n\n```json\n");
|
// and only if the total serialized size stays under a budget. Framework
|
||||||
out.push_str(&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default());
|
// hydration blobs (Next.js pageProps full of ad-targeting flags, build
|
||||||
out.push_str("\n```");
|
// IDs, schedule paths) explode to hundreds of KB and drown the LLM in
|
||||||
|
// noise — drop them rather than ship them.
|
||||||
|
let mut useful: Vec<_> = result
|
||||||
|
.structured_data
|
||||||
|
.iter()
|
||||||
|
.filter(|v| is_useful_structured_data(v))
|
||||||
|
.cloned()
|
||||||
|
.collect();
|
||||||
|
for value in &mut useful {
|
||||||
|
scrub_body_fields(value, 0);
|
||||||
|
}
|
||||||
|
if !useful.is_empty() {
|
||||||
|
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
|
||||||
|
const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
|
||||||
|
if serialized.len() <= STRUCTURED_DATA_MAX_BYTES {
|
||||||
|
out.push_str("\n\n## Structured Data\n\n```json\n");
|
||||||
|
out.push_str(&serialized);
|
||||||
|
out.push_str("\n```");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
out.trim().to_string()
|
out.trim().to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Decide whether a structured-data value carries content worth emitting.
|
||||||
|
///
|
||||||
|
/// Schema.org records with a recognizable content `@type` (Article, NewsArticle,
|
||||||
|
/// Product, Recipe, FAQPage, HowTo, Event, Person, Organization, BreadcrumbList,
|
||||||
|
/// VideoObject, JobPosting, etc.) are kept. Generic `WebSite` / `WebPage` /
|
||||||
|
/// `ItemList` records and Next.js `pageProps`-style blobs without a useful
|
||||||
|
/// `@type` are dropped — they're almost always navigation chrome or framework
|
||||||
|
/// hydration state.
|
||||||
|
fn is_useful_structured_data(v: &serde_json::Value) -> bool {
|
||||||
|
let Some(obj) = v.as_object() else {
|
||||||
|
// SvelteKit can emit compact arrays of page data. Keep those if they
|
||||||
|
// are small enough to be useful, while still dropping giant hydration
|
||||||
|
// arrays under the same budget as untyped objects.
|
||||||
|
if v.is_array() {
|
||||||
|
let serialized = serde_json::to_string(v).unwrap_or_default();
|
||||||
|
return serialized.len() <= 4 * 1024;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
// JSON-LD: @type drives the decision.
|
||||||
|
if let Some(t) = obj.get("@type") {
|
||||||
|
let types: Vec<String> = match t {
|
||||||
|
serde_json::Value::String(s) => vec![s.to_ascii_lowercase()],
|
||||||
|
serde_json::Value::Array(a) => a
|
||||||
|
.iter()
|
||||||
|
.filter_map(|x| x.as_str())
|
||||||
|
.map(str::to_ascii_lowercase)
|
||||||
|
.collect(),
|
||||||
|
_ => Vec::new(),
|
||||||
|
};
|
||||||
|
if types.is_empty() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Drop low-info chrome types.
|
||||||
|
const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"];
|
||||||
|
return types.iter().any(|t| !DROP_TYPES.iter().any(|d| t == d));
|
||||||
|
}
|
||||||
|
// Next.js pageProps / SvelteKit data without @type: keep only if compact.
|
||||||
|
// Anything over ~4KB is almost certainly hydration state, not content.
|
||||||
|
let serialized = serde_json::to_string(v).unwrap_or_default();
|
||||||
|
serialized.len() <= 4 * 1024
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Recursively remove long fields that duplicate the rendered markdown body.
|
||||||
|
///
|
||||||
|
/// `depth` guards against stack exhaustion from attacker-controlled
|
||||||
|
/// JSON-LD / `__NEXT_DATA__` blobs with pathological nesting: past
|
||||||
|
/// [`MAX_SCRUB_DEPTH`] levels we stop descending and leave the subtree
|
||||||
|
/// as-is (it is still size-capped by the `STRUCTURED_DATA_MAX_BYTES`
|
||||||
|
/// budget in `to_llm_text`).
|
||||||
|
fn scrub_body_fields(v: &mut serde_json::Value, depth: usize) {
|
||||||
|
const BODY_KEYS: &[&str] = &["articleBody"];
|
||||||
|
const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
|
||||||
|
const LONG_THRESHOLD: usize = 500;
|
||||||
|
const MAX_SCRUB_DEPTH: usize = 64;
|
||||||
|
|
||||||
|
if depth >= MAX_SCRUB_DEPTH {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
match v {
|
||||||
|
serde_json::Value::Object(map) => {
|
||||||
|
map.retain(|key, value| {
|
||||||
|
if BODY_KEYS.contains(&key.as_str()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if LONG_BODY_KEYS.contains(&key.as_str())
|
||||||
|
&& value.as_str().is_some_and(|s| s.len() >= LONG_THRESHOLD)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
true
|
||||||
|
});
|
||||||
|
for value in map.values_mut() {
|
||||||
|
scrub_body_fields(value, depth + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
serde_json::Value::Array(values) => {
|
||||||
|
for value in values {
|
||||||
|
scrub_body_fields(value, depth + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Integration tests that exercise the full pipeline through to_llm_text
|
// Integration tests that exercise the full pipeline through to_llm_text
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
@ -700,4 +804,168 @@ mod tests {
|
||||||
assert!(out.contains("Some content"), "Content before lost: {out}");
|
assert!(out.contains("Some content"), "Content before lost: {out}");
|
||||||
assert!(out.contains("More content"), "Content after lost: {out}");
|
assert!(out.contains("More content"), "Content after lost: {out}");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -- Structured-data gating tests --
|
||||||
|
|
||||||
|
fn make_result_with_structured(values: Vec<serde_json::Value>) -> ExtractionResult {
|
||||||
|
let mut r = make_result("# Body");
|
||||||
|
r.structured_data = values;
|
||||||
|
r
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
fn structured_data_drops_chrome_types() {
    // WebSite/WebPage records are framework chrome — should be dropped.
    let website = serde_json::json!({
        "@type": "WebSite",
        "name": "Example",
        "url": "https://example.com"
    });

    let out = to_llm_text(&make_result_with_structured(vec![website]), None);

    let has_section = out.contains("## Structured Data");
    assert!(!has_section, "WebSite chrome leaked into output: {out}");
}
|
||||||
|
|
||||||
|
#[test]
fn structured_data_keeps_article_types() {
    // A content-bearing Schema.org type is emitted with its fields.
    let article = serde_json::json!({
        "@type": "NewsArticle",
        "headline": "Big news",
        "datePublished": "2026-05-10"
    });

    let out = to_llm_text(&make_result_with_structured(vec![article]), None);

    let has_section = out.contains("## Structured Data");
    assert!(has_section, "NewsArticle dropped: {out}");
    assert!(out.contains("Big news"));
}
|
||||||
|
|
||||||
|
#[test]
fn structured_data_scrubs_duplicate_article_body() {
    // `articleBody` duplicates the rendered markdown and must be removed,
    // while the short sibling fields stay.
    let body = "This is the rendered article body. ".repeat(40);
    let article = serde_json::json!({
        "@type": "NewsArticle",
        "headline": "Big news",
        "articleBody": body,
        "description": "A short useful summary"
    });

    let out = to_llm_text(&make_result_with_structured(vec![article]), None);

    for kept in ["Big news", "A short useful summary"] {
        assert!(out.contains(kept));
    }
    let body_leaked = out.contains("articleBody");
    assert!(!body_leaked, "Duplicate article body leaked: {out}");
}
|
||||||
|
|
||||||
|
#[test]
fn llm_output_strips_comment_count_links_and_pagination() {
    let md = "Lead paragraph.\n\n[0](https://example.com/#comment-stream) Next\n\n5 minutes read\n\n[Article](https://example.com/article)";
    let out = to_llm_text(&make_result(md), None);

    // Real content and the genuine link survive.
    for kept in [
        "Lead paragraph.",
        "5 minutes read",
        "- Article: https://example.com/article",
    ] {
        assert!(out.contains(kept));
    }

    // The pagination line and the comment-count link do not.
    assert!(!out.contains("0 Next"), "Pagination leaked: {out}");
    let comment_link_left = out.contains("comment-stream");
    assert!(!comment_link_left, "Comment link leaked: {out}");
}
|
||||||
|
|
||||||
|
#[test]
fn structured_data_drops_oversized_blob() {
    // 32KB pageProps-style blob with no @type — should be dropped.
    let big = "x".repeat(32 * 1024);
    let blob = serde_json::json!({
        "buildId": "abc",
        "isFallback": false,
        "noise": big
    });

    let out = to_llm_text(&make_result_with_structured(vec![blob]), None);

    let has_section = out.contains("## Structured Data");
    assert!(
        !has_section,
        "Oversized untyped blob leaked: len={}",
        out.len()
    );
}
|
||||||
|
|
||||||
|
#[test]
fn structured_data_keeps_compact_untyped() {
    // Small untyped record (e.g. a parsed pageProps with real content) — keep.
    let record = serde_json::json!({
        "title": "Hi",
        "body": "small enough to keep"
    });

    let out = to_llm_text(&make_result_with_structured(vec![record]), None);

    let has_section = out.contains("## Structured Data");
    assert!(has_section, "Compact untyped dropped: {out}");
}
|
||||||
|
|
||||||
|
#[test]
fn structured_data_keeps_compact_untyped_array() {
    // SvelteKit can emit compact arrays rather than objects.
    let page_data = serde_json::json!([
        { "title": "Hi", "body": "small array item" }
    ]);

    let out = to_llm_text(&make_result_with_structured(vec![page_data]), None);

    let item_present = out.contains("small array item");
    assert!(item_present, "Compact untyped array dropped: {out}");
}
|
||||||
|
|
||||||
|
/// Walk `value` down its single `"n"` child link and return the depth
|
||||||
|
/// at which an `articleBody` key is still present (i.e. was NOT
|
||||||
|
/// scrubbed). Used to observe exactly where the recursion stopped.
|
||||||
|
fn first_unscrubbed_article_body_depth(mut value: &serde_json::Value) -> Option<usize> {
|
||||||
|
let mut depth = 0;
|
||||||
|
loop {
|
||||||
|
let obj = value.as_object()?;
|
||||||
|
if obj.contains_key("articleBody") {
|
||||||
|
return Some(depth);
|
||||||
|
}
|
||||||
|
value = obj.get("n")?;
|
||||||
|
depth += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
fn scrub_body_fields_bounds_recursion_on_deep_nesting() {
    // Hostile JSON-LD / __NEXT_DATA__ may nest arbitrarily deep, so the
    // scrubber must stop descending at its 64-level cap. The fixture is a
    // chain slightly longer than the cap in which every level has a
    // removable `articleBody`: levels inside the cap lose it, and the first
    // level beyond the cap keeps it — that survivor is the observable bound.
    // (The chain is kept short deliberately: serde_json drops Values
    // recursively, so a 10k-deep value would overflow the stack just being
    // dropped.)
    const DEPTH: usize = 80;

    let leaf = serde_json::json!({ "articleBody": "x".repeat(600) });
    let mut node = (0..DEPTH).fold(leaf, |inner, _| {
        serde_json::json!({
            "articleBody": "x".repeat(600),
            "n": inner,
        })
    });

    scrub_body_fields(&mut node, 0);

    // The outermost level was scrubbed...
    let root_still_has_body = node.as_object().unwrap().contains_key("articleBody");
    // ...and the first survivor sits exactly at the cap.
    let stopped_at = first_unscrubbed_article_body_depth(&node)
        .expect("recursion must stop and leave a deep articleBody intact");

    assert_eq!(
        stopped_at, 64,
        "recursion should stop at the depth cap, stopped at {stopped_at}"
    );
    assert!(
        !root_still_has_body,
        "shallow articleBody must still be scrubbed"
    );
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -320,6 +320,9 @@ fn children_to_md(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Node::Text(text) => {
|
Node::Text(text) => {
|
||||||
|
if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) {
|
||||||
|
out.push(' ');
|
||||||
|
}
|
||||||
out.push_str(text);
|
out.push_str(text);
|
||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
|
|
@ -350,6 +353,9 @@ fn inline_text(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Node::Text(text) => {
|
Node::Text(text) => {
|
||||||
|
if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) {
|
||||||
|
out.push(' ');
|
||||||
|
}
|
||||||
out.push_str(text);
|
out.push_str(text);
|
||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
|
|
@ -361,11 +367,65 @@ fn inline_text(
|
||||||
|
|
||||||
/// Check whether a space is needed between two adjacent chunks of output.
|
/// Check whether a space is needed between two adjacent chunks of output.
|
||||||
/// Returns true when the left side doesn't end with whitespace and the right
|
/// Returns true when the left side doesn't end with whitespace and the right
|
||||||
/// side doesn't start with whitespace — i.e., two words would be mashed together.
|
/// side doesn't start with whitespace, except around punctuation that should
|
||||||
|
/// bind to the adjacent token.
|
||||||
fn needs_separator(left: &str, right: &str) -> bool {
|
fn needs_separator(left: &str, right: &str) -> bool {
|
||||||
let l = left.as_bytes().last().copied().unwrap_or(b' ');
|
let l = left.chars().next_back().unwrap_or(' ');
|
||||||
let r = right.as_bytes().first().copied().unwrap_or(b' ');
|
let r = right.chars().next().unwrap_or(' ');
|
||||||
!l.is_ascii_whitespace() && !r.is_ascii_whitespace()
|
|
||||||
|
if l.is_whitespace() || r.is_whitespace() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Do not create "word ," / "word )" / "word 's" artifacts.
|
||||||
|
if is_closing_punctuation(r) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Do not create "( word" / "[ 1" artifacts.
|
||||||
|
if is_opening_punctuation(l) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Common inline-code suffixes: `Option`s, `x`'s. Treat them like a
|
||||||
|
// single token rather than separating the text node.
|
||||||
|
if matches!(l, '`' | ')') && starts_with_inline_code_suffix(right) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
fn starts_with_inline_code_suffix(s: &str) -> bool {
|
||||||
|
let trimmed = s.trim_start_matches(['*', '_']);
|
||||||
|
let mut chars = trimmed.chars();
|
||||||
|
let Some(first) = chars.next() else {
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
|
||||||
|
if matches!(first, '\'' | '’') {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if !matches!(first, 's' | 'S') {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
match chars.next() {
|
||||||
|
None => true,
|
||||||
|
Some(c) => c.is_whitespace() || is_closing_punctuation(c) || matches!(c, '*' | '_'),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Report whether `c` is punctuation that attaches to the token before it
/// (sentence punctuation, closing brackets, `%`, apostrophes, closing quotes).
fn is_closing_punctuation(c: char) -> bool {
    const CLOSERS: [char; 14] = [
        '.', ',', ';', ':', '!', '?', ')', ']', '}', '%', '\'', '’', '"', '”',
    ];
    CLOSERS.contains(&c)
}
|
||||||
|
|
||||||
|
/// Report whether `c` is punctuation that attaches to the token after it
/// (opening brackets and opening quotes).
fn is_opening_punctuation(c: char) -> bool {
    const OPENERS: [char; 5] = ['(', '[', '{', '"', '“'];
    OPENERS.contains(&c)
}
|
||||||
|
|
||||||
/// Collect raw text content (no markdown formatting).
|
/// Collect raw text content (no markdown formatting).
|
||||||
|
|
@ -920,8 +980,10 @@ fn strip_markdown(md: &str) -> String {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs
|
// Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs.
|
||||||
if trimmed.starts_with('|') && trimmed.ends_with('|') {
|
// Require at least 2 chars so the slice `[1..len-1]` stays non-empty on single-pipe rows
|
||||||
|
// (which aren't real tables anyway); a lone `|` previously panicked at `begin <= end`.
|
||||||
|
if trimmed.len() >= 2 && trimmed.starts_with('|') && trimmed.ends_with('|') {
|
||||||
let inner = &trimmed[1..trimmed.len() - 1];
|
let inner = &trimmed[1..trimmed.len() - 1];
|
||||||
let cells: Vec<&str> = inner.split('|').map(|c| c.trim()).collect();
|
let cells: Vec<&str> = inner.split('|').map(|c| c.trim()).collect();
|
||||||
lines.push(cells.join("\t"));
|
lines.push(cells.join("\t"));
|
||||||
|
|
@ -1604,4 +1666,39 @@ mod tests {
|
||||||
"collapse_whitespace stripped 6-space indent: {output}"
|
"collapse_whitespace stripped 6-space indent: {output}"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn text_after_inline_element_keeps_separator() {
    // Modelled on Reuters markup such as
    // <a><time>3h</time>ago</a><a>Tanker crosses...</a>: the bare "ago" text
    // node is sandwiched between two element children. If the Text branch
    // skipped the separator check, "ago" and "Tanker" would be joined into
    // "agoTanker".
    let html = r#"<div><span>3h</span>ago<span>Tanker crosses Strait</span></div>"#;
    let (md, _, _) = convert_html(html, None);

    let smashed = md.contains("agoTanker");
    assert!(!smashed, "Element->Text->Element smashed together: {md}");
}
|
||||||
|
|
||||||
|
#[test]
fn punctuation_after_inline_element_stays_attached() {
    // Punctuation that follows an inline element must not gain a space.
    let html = r#"<p><span>Hello</span>, world. Use <code>package.json</code>.</p>"#;
    let (md, _, _) = convert_html(html, None);

    let comma_attached = md.contains("Hello, world");
    assert!(comma_attached, "punctuation detached: {md}");

    let period_attached = md.contains("`package.json`.");
    assert!(period_attached, "code punctuation detached: {md}");
}
|
||||||
|
|
||||||
|
#[test]
fn inline_code_suffix_stays_attached() {
    // A plural "s" after linked inline code must stay glued to the link.
    let html = r#"<p><a href="https://example.com"><code>NullPointerException</code></a><em>s</em> are common.</p>"#;
    let (md, _, _) = convert_html(html, None);

    let suffix_attached =
        md.contains("[`NullPointerException`](https://example.com)*s* are common");
    assert!(suffix_attached, "code suffix detached: {md}");
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
968
crates/webclaw-core/src/reddit.rs
Normal file
968
crates/webclaw-core/src/reddit.rs
Normal file
|
|
@ -0,0 +1,968 @@
|
||||||
|
//! Reddit thread extractor — parses old.reddit.com HTML directly.
|
||||||
|
//!
|
||||||
|
//! old.reddit.com serves fully server-rendered HTML with stable class names
|
||||||
|
//! and data attributes. No JS, no API key, no `.json` trick needed.
|
||||||
|
|
||||||
|
use scraper::{ElementRef, Html, Selector};
|
||||||
|
use serde::Serialize;
|
||||||
|
|
||||||
|
use crate::{Content, DomainData, DomainType, ExtractionResult, Metadata};
|
||||||
|
|
||||||
|
// ─── Public types ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
pub struct RedditPost {
|
||||||
|
pub id: Option<String>,
|
||||||
|
pub title: String,
|
||||||
|
pub author: String,
|
||||||
|
pub subreddit: Option<String>,
|
||||||
|
pub score: i64,
|
||||||
|
pub body: Option<String>,
|
||||||
|
pub num_comments: usize,
|
||||||
|
pub permalink: String,
|
||||||
|
pub url: Option<String>,
|
||||||
|
pub is_self: bool,
|
||||||
|
pub flair: Option<String>,
|
||||||
|
pub created_utc: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
pub struct RedditComment {
|
||||||
|
pub id: Option<String>,
|
||||||
|
pub author: String,
|
||||||
|
pub body: String,
|
||||||
|
/// `None` when Reddit hides the score (fresh comments). Distinct from
|
||||||
|
/// `Some(0)`, which is a real net-zero score.
|
||||||
|
pub score: Option<i64>,
|
||||||
|
pub depth: usize,
|
||||||
|
pub is_op: bool,
|
||||||
|
pub created_utc: Option<String>,
|
||||||
|
pub replies: Vec<RedditComment>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
pub struct RedditThread {
|
||||||
|
#[serde(rename = "url")]
|
||||||
|
pub source_url: String,
|
||||||
|
pub post: Option<RedditPost>,
|
||||||
|
pub comments: Vec<RedditComment>,
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Public API ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
pub fn is_reddit_url(url: &str) -> bool {
|
||||||
|
matches!(
|
||||||
|
host_of(url),
|
||||||
|
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Try to parse a Reddit thread from old.reddit.com HTML.
|
||||||
|
/// Returns `None` if the page doesn't have recognisable Reddit structure.
|
||||||
|
pub fn try_extract_thread(html: &str, url: &str) -> Option<RedditThread> {
|
||||||
|
if !url.contains("/comments/") {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let doc = Html::parse_document(html);
|
||||||
|
let post = parse_post(&doc);
|
||||||
|
let op = post.as_ref().map(|p| p.author.as_str()).unwrap_or("");
|
||||||
|
let comments = parse_comments(&doc, op);
|
||||||
|
|
||||||
|
if post.is_none() && comments.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(RedditThread {
|
||||||
|
source_url: url.to_string(),
|
||||||
|
post,
|
||||||
|
comments,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Entry point for `webclaw-core`'s extraction fast path.
|
||||||
|
pub fn try_extract(html: &str, url: &str) -> Option<ExtractionResult> {
|
||||||
|
let thread = try_extract_thread(html, url)?;
|
||||||
|
Some(to_extraction_result(&thread))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── ExtractionResult builder ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
fn to_extraction_result(thread: &RedditThread) -> ExtractionResult {
|
||||||
|
let md = to_markdown(thread);
|
||||||
|
let plain = plain_text(&md);
|
||||||
|
let wc = md.split_whitespace().count();
|
||||||
|
|
||||||
|
let (title, author, site_name) = thread
|
||||||
|
.post
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| {
|
||||||
|
(
|
||||||
|
Some(p.title.clone()),
|
||||||
|
Some(p.author.clone()),
|
||||||
|
p.subreddit.clone(),
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
ExtractionResult {
|
||||||
|
metadata: Metadata {
|
||||||
|
title,
|
||||||
|
description: None,
|
||||||
|
author,
|
||||||
|
published_date: None,
|
||||||
|
language: Some("en".to_string()),
|
||||||
|
url: Some(thread.source_url.clone()),
|
||||||
|
site_name,
|
||||||
|
image: None,
|
||||||
|
favicon: None,
|
||||||
|
word_count: wc,
|
||||||
|
},
|
||||||
|
content: Content {
|
||||||
|
markdown: md,
|
||||||
|
plain_text: plain,
|
||||||
|
links: vec![],
|
||||||
|
images: vec![],
|
||||||
|
code_blocks: vec![],
|
||||||
|
raw_html: None,
|
||||||
|
},
|
||||||
|
domain_data: Some(DomainData {
|
||||||
|
domain_type: DomainType::Social,
|
||||||
|
}),
|
||||||
|
structured_data: vec![],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Markdown rendering ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
pub fn to_markdown(thread: &RedditThread) -> String {
|
||||||
|
let mut out = String::new();
|
||||||
|
|
||||||
|
if let Some(p) = &thread.post {
|
||||||
|
out.push_str(&format!("# {}\n\n", p.title));
|
||||||
|
|
||||||
|
let pts = pt_label(Some(p.score));
|
||||||
|
let cmt = match p.num_comments {
|
||||||
|
0 => String::new(),
|
||||||
|
1 => " · 1 comment".to_string(),
|
||||||
|
n => format!(" · {n} comments"),
|
||||||
|
};
|
||||||
|
let sub = p.subreddit.as_deref().unwrap_or("?");
|
||||||
|
out.push_str(&format!("**u/{}** · r/{sub} · {pts}{cmt}\n\n", p.author));
|
||||||
|
|
||||||
|
if let Some(ref body) = p.body
|
||||||
|
&& !body.is_empty()
|
||||||
|
{
|
||||||
|
out.push_str(body);
|
||||||
|
out.push_str("\n\n");
|
||||||
|
}
|
||||||
|
if let Some(ref link) = p.url
|
||||||
|
&& !p.is_self
|
||||||
|
{
|
||||||
|
out.push_str(&format!("[Link]({link})\n\n"));
|
||||||
|
}
|
||||||
|
out.push_str("---\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
if !thread.comments.is_empty() {
|
||||||
|
out.push_str("## Comments\n\n");
|
||||||
|
for c in &thread.comments {
|
||||||
|
render_comment(c, &mut out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
collapse_blank_lines(out.trim_end())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Render one comment + its replies. Nesting is expressed with blockquote
|
||||||
|
/// depth (`> ` per level) rather than leading spaces: space-indentation of
|
||||||
|
/// 4+ would turn ordinary text and ``` fences into CommonMark indented code
|
||||||
|
/// blocks, corrupting any comment at depth ≥ 2.
|
||||||
|
fn render_comment(c: &RedditComment, out: &mut String) {
|
||||||
|
let q = "> ".repeat(c.depth);
|
||||||
|
let blank = ">".repeat(c.depth);
|
||||||
|
let author = if c.is_op {
|
||||||
|
format!("**u/{} [OP]**", c.author)
|
||||||
|
} else {
|
||||||
|
format!("**u/{}**", c.author)
|
||||||
|
};
|
||||||
|
out.push_str(&format!("{q}{author} · {}\n", pt_label(c.score)));
|
||||||
|
for line in c.body.lines() {
|
||||||
|
if line.is_empty() {
|
||||||
|
out.push_str(&blank);
|
||||||
|
out.push('\n');
|
||||||
|
} else {
|
||||||
|
out.push_str(&q);
|
||||||
|
out.push_str(line);
|
||||||
|
out.push('\n');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out.push('\n');
|
||||||
|
for reply in &c.replies {
|
||||||
|
render_comment(reply, out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Human-readable score label: `"score hidden"` for `None`, singular
/// `"pt"` for exactly ±1, plural `"pts"` for every other value.
fn pt_label(n: Option<i64>) -> String {
    let Some(points) = n else {
        return String::from("score hidden");
    };
    let unit = if matches!(points, 1 | -1) { "pt" } else { "pts" };
    format!("{points} {unit}")
}
|
||||||
|
|
||||||
|
/// Cap every run of consecutive `\n` characters at two, so at most one
/// blank line separates blocks. Keeps blockquote prefixes and `<pre>`
/// spacing from leaving large gaps. All other characters pass through.
fn collapse_blank_lines(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    // Length of the newline run ending at the current character.
    let mut run = 0usize;
    for c in s.chars() {
        run = if c == '\n' { run + 1 } else { 0 };
        if run <= 2 {
            collapsed.push(c);
        }
    }
    collapsed
}
|
||||||
|
|
||||||
|
/// Reduce rendered markdown to plain text, line by line.
///
/// For each line: trim leading whitespace, strip at most ONE leading
/// blockquote marker (`"> "` or `">"`), strip leading `#` heading markers,
/// then remove emphasis / strikethrough / code markers (`**`, `~~`, `*`,
/// and backticks). Lines are re-joined with `\n`.
///
/// Only a single blockquote marker is removed on purpose: greedy
/// character-class stripping ate legitimate `>`-prefixed quoted content.
fn plain_text(md: &str) -> String {
    let mut cleaned: Vec<String> = Vec::new();
    for raw in md.lines() {
        let line = raw.trim_start();
        let unquoted = match line.strip_prefix("> ") {
            Some(rest) => rest,
            None => line.strip_prefix('>').unwrap_or(line),
        };
        let text = unquoted.trim_start_matches('#').trim_start();
        cleaned.push(
            text.replace("**", "")
                .replace("~~", "")
                .replace(['*', '`'], ""),
        );
    }
    cleaned.join("\n")
}
|
||||||
|
|
||||||
|
// ─── HTML parsing ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
fn parse_post(doc: &Html) -> Option<RedditPost> {
|
||||||
|
let sel = Selector::parse("#siteTable .thing.link").ok()?;
|
||||||
|
let thing = doc.select(&sel).next()?;
|
||||||
|
let v = thing.value();
|
||||||
|
|
||||||
|
let id = v
|
||||||
|
.attr("data-fullname")
|
||||||
|
.map(|s| s.trim_start_matches("t3_").to_string());
|
||||||
|
let author = v.attr("data-author").unwrap_or("[deleted]").to_string();
|
||||||
|
let subreddit = v.attr("data-subreddit").map(str::to_string);
|
||||||
|
let score: i64 = v
|
||||||
|
.attr("data-score")
|
||||||
|
.and_then(|s| s.parse().ok())
|
||||||
|
.unwrap_or(0);
|
||||||
|
let num_comments: usize = v
|
||||||
|
.attr("data-comments-count")
|
||||||
|
.and_then(|s| s.parse().ok())
|
||||||
|
.unwrap_or(0);
|
||||||
|
let permalink_path = v.attr("data-permalink").unwrap_or("");
|
||||||
|
let permalink = format!("https://old.reddit.com{permalink_path}");
|
||||||
|
// Self-posts carry the `self` class and a `self.<sub>` domain; their
|
||||||
|
// data-url points back at the permalink rather than an external site.
|
||||||
|
let is_self = v.has_class("self", scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||||
|
|| v.attr("data-domain")
|
||||||
|
.is_some_and(|d| d.starts_with("self."));
|
||||||
|
let link_url = v.attr("data-url").map(str::to_string);
|
||||||
|
let url = if is_self { None } else { link_url };
|
||||||
|
|
||||||
|
// Title
|
||||||
|
let sel_title = Selector::parse(".title a.title").ok()?;
|
||||||
|
let title = thing
|
||||||
|
.select(&sel_title)
|
||||||
|
.next()
|
||||||
|
.map(|el| el.text().collect::<String>().trim().to_string())
|
||||||
|
.filter(|s| !s.is_empty())?;
|
||||||
|
|
||||||
|
// Flair
|
||||||
|
let flair = Selector::parse(".linkflairlabel")
|
||||||
|
.ok()
|
||||||
|
.and_then(|s| thing.select(&s).next())
|
||||||
|
.map(|el| el.text().collect::<String>().trim().to_string())
|
||||||
|
.filter(|s| !s.is_empty());
|
||||||
|
|
||||||
|
// Self-text body: thing > .entry > .expando > .usertext-body [> .md]
|
||||||
|
let body = direct_child(thing, "entry")
|
||||||
|
.and_then(|entry| find_class(entry, "expando"))
|
||||||
|
.and_then(|expando| find_class(expando, "usertext-body"))
|
||||||
|
.and_then(|ut| find_class(ut, "md"))
|
||||||
|
.map(md_to_markdown)
|
||||||
|
.filter(|s| !s.is_empty());
|
||||||
|
|
||||||
|
// Datetime
|
||||||
|
let created_utc = Selector::parse("time[datetime]")
|
||||||
|
.ok()
|
||||||
|
.and_then(|s| thing.select(&s).next())
|
||||||
|
.and_then(|t| t.value().attr("datetime"))
|
||||||
|
.map(str::to_string);
|
||||||
|
|
||||||
|
Some(RedditPost {
|
||||||
|
id,
|
||||||
|
title,
|
||||||
|
author,
|
||||||
|
subreddit,
|
||||||
|
score,
|
||||||
|
body,
|
||||||
|
num_comments,
|
||||||
|
permalink,
|
||||||
|
url,
|
||||||
|
is_self,
|
||||||
|
flair,
|
||||||
|
created_utc,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Comment parsing ───────────────────────────────────────────────────────────
|
||||||
|
//
|
||||||
|
// old.reddit.com nests comments structurally, not via a depth attribute:
|
||||||
|
//
|
||||||
|
// .commentarea
|
||||||
|
// .sitetable.nestedlisting
|
||||||
|
// .comment.thing ← root comment
|
||||||
|
// .entry → form → .usertext-body → .md ← its own body
|
||||||
|
// .child
|
||||||
|
// .sitetable.listing
|
||||||
|
// .comment.thing ← reply (recurse)
|
||||||
|
//
|
||||||
|
// `data-depth`/`data-replies` are absent or always "0" in the logged-out
|
||||||
|
// HTML, so we walk the tree by recursing into each comment's `.child`.
|
||||||
|
|
||||||
|
fn parse_comments(doc: &Html, op: &str) -> Vec<RedditComment> {
|
||||||
|
// Root listing is `.sitetable.nestedlisting` inside `.commentarea`
|
||||||
|
// (note: `commentarea` is a class on old.reddit, not an id). Fall back
|
||||||
|
// to the first `.nestedlisting` anywhere for comment-permalink pages.
|
||||||
|
let listing = Selector::parse(".commentarea .sitetable.nestedlisting")
|
||||||
|
.ok()
|
||||||
|
.and_then(|s| doc.select(&s).next())
|
||||||
|
.or_else(|| {
|
||||||
|
Selector::parse(".sitetable.nestedlisting")
|
||||||
|
.ok()
|
||||||
|
.and_then(|s| doc.select(&s).next())
|
||||||
|
});
|
||||||
|
|
||||||
|
match listing {
|
||||||
|
Some(l) => walk_comment_level(l, op, 0),
|
||||||
|
None => vec![],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse the direct-child `.comment.thing` elements of a comment listing.
|
||||||
|
fn walk_comment_level(listing: ElementRef, op: &str, depth: usize) -> Vec<RedditComment> {
|
||||||
|
listing
|
||||||
|
.children()
|
||||||
|
.filter_map(ElementRef::wrap)
|
||||||
|
.filter(|c| {
|
||||||
|
let val = c.value();
|
||||||
|
val.has_class("comment", scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||||
|
&& val.has_class("thing", scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||||
|
})
|
||||||
|
.filter_map(|c| parse_one_comment(c, op, depth))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_one_comment(c: ElementRef, op: &str, depth: usize) -> Option<RedditComment> {
|
||||||
|
let v = c.value();
|
||||||
|
|
||||||
|
// "load more comments" placeholders are `.thing` with type=morechildren.
|
||||||
|
// They carry a t1_ fullname but no real content — skip them.
|
||||||
|
if v.attr("data-type") == Some("morechildren")
|
||||||
|
|| v.has_class(
|
||||||
|
"morechildren",
|
||||||
|
scraper::CaseSensitivity::AsciiCaseInsensitive,
|
||||||
|
)
|
||||||
|
{
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let is_deleted = v.has_class("deleted", scraper::CaseSensitivity::AsciiCaseInsensitive);
|
||||||
|
let id = v
|
||||||
|
.attr("data-fullname")
|
||||||
|
.map(|s| s.trim_start_matches("t1_").to_string());
|
||||||
|
let author = v
|
||||||
|
.attr("data-author")
|
||||||
|
.filter(|a| !a.is_empty())
|
||||||
|
.unwrap_or("[deleted]")
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
// Own body lives in `.entry > form > .usertext-body > .md`. `.child`
|
||||||
|
// (nested replies) is a sibling of `.entry`, so descending within
|
||||||
|
// `.entry` never crosses into a reply's body.
|
||||||
|
let entry = direct_child(c, "entry");
|
||||||
|
let body = entry
|
||||||
|
.and_then(|e| find_class(e, "usertext-body"))
|
||||||
|
.and_then(|ut| find_class(ut, "md"))
|
||||||
|
.map(md_to_markdown)
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
.unwrap_or_else(|| {
|
||||||
|
if is_deleted {
|
||||||
|
"[removed]".into()
|
||||||
|
} else {
|
||||||
|
String::new()
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Displayed score is `.score.unvoted`, whose `title` holds the exact
|
||||||
|
// integer (the sibling likes/dislikes spans are ±1). Hidden-score
|
||||||
|
// comments have no `.score.unvoted` span, so `comment_score` returns
|
||||||
|
// None — kept distinct from a genuine 0.
|
||||||
|
let score = entry.and_then(comment_score);
|
||||||
|
|
||||||
|
let created_utc = entry
|
||||||
|
.zip(Selector::parse("time[datetime]").ok())
|
||||||
|
.and_then(|(e, s)| e.select(&s).next())
|
||||||
|
.and_then(|t| t.value().attr("datetime"))
|
||||||
|
.map(str::to_string);
|
||||||
|
|
||||||
|
let is_op = !is_deleted && author != "[deleted]" && author == op;
|
||||||
|
|
||||||
|
// Replies: `.comment > .child > .sitetable > .comment`.
|
||||||
|
let replies = direct_child(c, "child")
|
||||||
|
.and_then(|child| direct_child(child, "sitetable"))
|
||||||
|
.map(|st| walk_comment_level(st, op, depth + 1))
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
Some(RedditComment {
|
||||||
|
id,
|
||||||
|
author,
|
||||||
|
body,
|
||||||
|
score,
|
||||||
|
depth,
|
||||||
|
is_op,
|
||||||
|
created_utc,
|
||||||
|
replies,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read a comment's score from the `.score.unvoted` span inside `.entry`.
|
||||||
|
/// Prefers the `title` attribute (exact integer); falls back to the text.
|
||||||
|
/// Returns `None` when Reddit hides the score (no `.score.unvoted` span).
|
||||||
|
fn comment_score(entry: ElementRef) -> Option<i64> {
|
||||||
|
let sel = Selector::parse("span.score.unvoted").ok()?;
|
||||||
|
let span = entry.select(&sel).next()?;
|
||||||
|
span.value()
|
||||||
|
.attr("title")
|
||||||
|
.and_then(|t| t.trim().parse().ok())
|
||||||
|
.or_else(|| parse_score(&span.text().collect::<String>()))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── DOM helpers ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// First direct child element whose class list includes `class`.
|
||||||
|
fn direct_child<'a>(el: ElementRef<'a>, class: &str) -> Option<ElementRef<'a>> {
|
||||||
|
el.children().filter_map(ElementRef::wrap).find(|c| {
|
||||||
|
c.value()
|
||||||
|
.has_class(class, scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// First descendant (any depth) whose class list includes `class`.
|
||||||
|
fn find_class<'a>(el: ElementRef<'a>, class: &str) -> Option<ElementRef<'a>> {
|
||||||
|
el.children().filter_map(ElementRef::wrap).find_map(|c| {
|
||||||
|
if c.value()
|
||||||
|
.has_class(class, scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||||
|
{
|
||||||
|
Some(c)
|
||||||
|
} else {
|
||||||
|
find_class(c, class)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse the leading whitespace-delimited token of `text` as a score.
///
/// The typographic minus sign (`−`, U+2212) is mapped to ASCII `-` before
/// parsing. Returns `None` for empty input or a non-numeric first token.
fn parse_score(text: &str) -> Option<i64> {
    let first = text.split_whitespace().next()?;
    first.replace('−', "-").parse().ok()
}
|
||||||
|
|
||||||
|
// ─── .md div → markdown ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
fn md_to_markdown(el: ElementRef) -> String {
|
||||||
|
let mut out = String::new();
|
||||||
|
render_children(el, &mut out);
|
||||||
|
out.trim().to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn render_children(el: ElementRef, out: &mut String) {
|
||||||
|
use scraper::node::Node;
|
||||||
|
for child in el.children() {
|
||||||
|
match child.value() {
|
||||||
|
Node::Text(t) => out.push_str(t.as_ref()),
|
||||||
|
Node::Element(_) => {
|
||||||
|
if let Some(c) = ElementRef::wrap(child) {
|
||||||
|
render_node(c, out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn render_node(el: ElementRef, out: &mut String) {
|
||||||
|
match el.value().name() {
|
||||||
|
"p" | "div" => {
|
||||||
|
let mut inner = String::new();
|
||||||
|
render_children(el, &mut inner);
|
||||||
|
let t = inner.trim();
|
||||||
|
if !t.is_empty() {
|
||||||
|
out.push_str(t);
|
||||||
|
out.push_str("\n\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"br" => out.push('\n'),
|
||||||
|
"strong" | "b" => {
|
||||||
|
let t: String = el.text().collect();
|
||||||
|
let t = t.trim();
|
||||||
|
if !t.is_empty() {
|
||||||
|
out.push_str(&format!("**{t}**"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"em" | "i" => {
|
||||||
|
let t: String = el.text().collect();
|
||||||
|
let t = t.trim();
|
||||||
|
if !t.is_empty() {
|
||||||
|
out.push_str(&format!("*{t}*"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"del" | "s" | "strike" => {
|
||||||
|
let t: String = el.text().collect();
|
||||||
|
let t = t.trim();
|
||||||
|
if !t.is_empty() {
|
||||||
|
out.push_str(&format!("~~{t}~~"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"code" => {
|
||||||
|
let t: String = el.text().collect();
|
||||||
|
out.push('`');
|
||||||
|
out.push_str(t.trim());
|
||||||
|
out.push('`');
|
||||||
|
}
|
||||||
|
"pre" => {
|
||||||
|
let t: String = el.text().collect();
|
||||||
|
out.push_str("```\n");
|
||||||
|
out.push_str(t.trim_end_matches('\n'));
|
||||||
|
out.push_str("\n```\n\n");
|
||||||
|
}
|
||||||
|
"a" => {
|
||||||
|
let text: String = el.text().collect();
|
||||||
|
let text = text.trim();
|
||||||
|
if !text.is_empty() {
|
||||||
|
// Preserve the destination as a markdown link. Resolve
|
||||||
|
// root-relative reddit hrefs (/r/, /user/, /wiki/, ...) and
|
||||||
|
// drop non-navigational ones (javascript:, #fragment, mailto:).
|
||||||
|
let href = el.value().attr("href").unwrap_or("");
|
||||||
|
if href.starts_with("http://") || href.starts_with("https://") {
|
||||||
|
out.push_str(&format!("[{text}]({href})"));
|
||||||
|
} else if href.starts_with('/') {
|
||||||
|
out.push_str(&format!("[{text}](https://old.reddit.com{href})"));
|
||||||
|
} else {
|
||||||
|
out.push_str(text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"blockquote" => {
|
||||||
|
let mut inner = String::new();
|
||||||
|
render_children(el, &mut inner);
|
||||||
|
let trimmed = inner.trim();
|
||||||
|
for line in trimmed.lines() {
|
||||||
|
out.push('>');
|
||||||
|
if !line.is_empty() {
|
||||||
|
out.push(' ');
|
||||||
|
out.push_str(line);
|
||||||
|
}
|
||||||
|
out.push('\n');
|
||||||
|
}
|
||||||
|
out.push('\n');
|
||||||
|
}
|
||||||
|
"ul" => render_list(el, false, 0, out),
|
||||||
|
"ol" => render_list(el, true, 0, out),
|
||||||
|
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
|
||||||
|
let level = el
|
||||||
|
.value()
|
||||||
|
.name()
|
||||||
|
.chars()
|
||||||
|
.nth(1)
|
||||||
|
.and_then(|c| c.to_digit(10))
|
||||||
|
.unwrap_or(2) as usize;
|
||||||
|
let t: String = el.text().collect();
|
||||||
|
let t = t.trim();
|
||||||
|
if !t.is_empty() {
|
||||||
|
out.push_str(&"#".repeat(level));
|
||||||
|
out.push(' ');
|
||||||
|
out.push_str(t);
|
||||||
|
out.push_str("\n\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"hr" => out.push_str("---\n\n"),
|
||||||
|
"sup" => {
|
||||||
|
let t: String = el.text().collect();
|
||||||
|
out.push_str(t.trim());
|
||||||
|
}
|
||||||
|
// Unknown / generic containers: recurse
|
||||||
|
_ => render_children(el, out),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Render a `<ul>`/`<ol>`, indenting nested lists by two spaces per level so
|
||||||
|
/// child items keep their own line instead of being glued to the parent.
|
||||||
|
fn render_list(list: ElementRef, ordered: bool, indent: usize, out: &mut String) {
|
||||||
|
use scraper::node::Node;
|
||||||
|
let pad = " ".repeat(indent);
|
||||||
|
let mut n = 0;
|
||||||
|
for li in list
|
||||||
|
.children()
|
||||||
|
.filter_map(ElementRef::wrap)
|
||||||
|
.filter(|c| c.value().name() == "li")
|
||||||
|
{
|
||||||
|
n += 1;
|
||||||
|
// Inline content of this <li>, excluding nested lists (rendered after).
|
||||||
|
let mut inline = String::new();
|
||||||
|
for child in li.children() {
|
||||||
|
match child.value() {
|
||||||
|
Node::Text(t) => inline.push_str(t.as_ref()),
|
||||||
|
Node::Element(e) if e.name() == "ul" || e.name() == "ol" => {}
|
||||||
|
Node::Element(_) => {
|
||||||
|
if let Some(c) = ElementRef::wrap(child) {
|
||||||
|
render_node(c, &mut inline);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let marker = if ordered {
|
||||||
|
format!("{n}. ")
|
||||||
|
} else {
|
||||||
|
"- ".to_string()
|
||||||
|
};
|
||||||
|
out.push_str(&format!("{pad}{marker}{}\n", inline.trim()));
|
||||||
|
|
||||||
|
for child in li.children().filter_map(ElementRef::wrap) {
|
||||||
|
match child.value().name() {
|
||||||
|
"ul" => render_list(child, false, indent + 1, out),
|
||||||
|
"ol" => render_list(child, true, indent + 1, out),
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if indent == 0 {
|
||||||
|
out.push('\n');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── URL helpers ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Extract the host portion of `url`: the text after the first `://`
/// (or the whole input when there is none), cut at the first `/`, `?`
/// or `#`.
fn host_of(url: &str) -> &str {
    let mut pieces = url.split("://");
    pieces.next();
    let authority_onward = pieces.next().unwrap_or(url);
    match authority_onward.find(['/', '?', '#']) {
        Some(cut) => &authority_onward[..cut],
        None => authority_onward,
    }
}
|
||||||
|
|
||||||
|
// ─── Tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // Thread URLs handed to the extractor alongside the fixture HTML.
    const PROGRAMMING_THREAD: &str = "https://old.reddit.com/r/programming/comments/abc123/t/";
    const ASKREDDIT_THREAD: &str = "https://old.reddit.com/r/AskReddit/comments/abc123/t/";
    const RUST_THREAD: &str = "https://old.reddit.com/r/rust/comments/abc123/t/";

    /// Parse `fragment`, take its first `.md` element, and convert it.
    fn md_of(fragment: &str) -> String {
        let doc = Html::parse_fragment(fragment);
        let sel = Selector::parse(".md").unwrap();
        let root = doc.select(&sel).next().unwrap();
        let rendered = md_to_markdown(root);
        rendered
    }

    #[test]
    fn is_reddit_url_recognises_variants() {
        for url in [
            "https://www.reddit.com/r/rust/comments/abc/x/",
            "https://old.reddit.com/r/rust/comments/abc/x/",
            "https://reddit.com/r/rust/comments/abc/x/",
        ] {
            assert!(is_reddit_url(url));
        }
        assert!(!is_reddit_url("https://example.com"));
    }

    #[test]
    fn try_extract_thread_returns_none_for_listing_url() {
        let empty_page = "<html><body></body></html>";
        assert!(try_extract_thread(empty_page, "https://old.reddit.com/r/rust/").is_none());
    }

    #[test]
    fn md_to_markdown_basic() {
        let md = md_of(r#"<div class="md"><p>Hello <strong>world</strong>!</p></div>"#);
        assert!(md.contains("**world**"));
        assert!(md.contains("Hello"));
    }

    #[test]
    fn md_to_markdown_blockquote_and_code() {
        let md = md_of(
            r#"<div class="md"><blockquote><p>Quoted</p></blockquote><pre><code>fn main() {}</code></pre></div>"#,
        );
        assert!(md.contains("> Quoted"));
        assert!(md.contains("```"));
        assert!(md.contains("fn main()"));
    }

    #[test]
    fn md_to_markdown_link_preserves_href() {
        // Absolute links keep their destination.
        let absolute =
            md_of(r#"<div class="md"><p>see <a href="https://example.com/x">this</a></p></div>"#);
        assert!(absolute.contains("[this](https://example.com/x)"));

        // Root-relative reddit links resolve against old.reddit.com.
        let relative = md_of(r#"<div class="md"><p><a href="/r/rust/wiki/faq">faq</a></p></div>"#);
        assert!(relative.contains("[faq](https://old.reddit.com/r/rust/wiki/faq)"));

        // javascript: / fragment hrefs degrade to bare text.
        let scripted = md_of(r#"<div class="md"><p><a href="javascript:void(0)">x</a></p></div>"#);
        assert!(scripted.contains('x') && !scripted.contains("javascript"));
    }

    // ── Regression tests against REAL old.reddit.com HTML ──────────────────
    //
    // These fixtures are genuine pages fetched from old.reddit.com (see
    // testdata/reddit/). They are the ground truth: synthetic HTML is too
    // easy to write to match wrong assumptions, which is exactly how the
    // first version of this parser shipped silently broken.

    /// Read a fixture from `testdata/reddit/`, relative to the working directory.
    fn fixture(name: &str) -> String {
        std::fs::read_to_string(format!("testdata/reddit/{name}")).unwrap()
    }

    /// Number of comments in the forest, replies included.
    fn total_comments(cs: &[RedditComment]) -> usize {
        cs.iter().map(|c| 1 + total_comments(&c.replies)).sum()
    }

    /// Flatten the forest into `out`, parents before their replies.
    fn flatten_into<'a>(cs: &'a [RedditComment], out: &mut Vec<&'a RedditComment>) {
        for comment in cs {
            out.push(comment);
            flatten_into(&comment.replies, out);
        }
    }

    #[test]
    fn real_link_post_metadata() {
        // pandas: external-link post (blog.geekuni.com), 34 comments.
        let page = fixture("pandas_34comments.html");
        let thread = try_extract_thread(&page, PROGRAMMING_THREAD).expect("should parse");
        let post = thread.post.expect("post");

        assert_eq!(post.author, "Horror-Willingness74");
        assert_eq!(post.subreddit.as_deref(), Some("programming"));
        assert_eq!(post.score, 43);
        assert_eq!(post.num_comments, 34, "data-comments-count");
        assert!(!post.is_self, "external blog link, not a self post");
        assert_eq!(
            post.url.as_deref(),
            Some("https://blog.geekuni.com/2026/06/why-learn-pandas.html")
        );
        assert!(post.title.contains("Pandas"));
    }

    #[test]
    fn real_self_post_metadata() {
        // A self-post (text) on r/rust: `self.rust` domain, self-text body,
        // no external url.
        let page = fixture("rust_selfpost_36comments.html");
        let thread = try_extract_thread(&page, RUST_THREAD).expect("should parse");
        let post = thread.post.expect("post");

        assert!(post.is_self, "self.rust domain → self post");
        assert_eq!(post.url, None, "self posts carry no external url");
        assert_eq!(post.subreddit.as_deref(), Some("rust"));
        assert!(
            post.body
                .as_deref()
                .unwrap_or("")
                .contains("IT project manager"),
            "self-text body should be extracted: {:?}",
            post.body
        );
    }

    #[test]
    fn real_comment_bodies_and_scores() {
        // The original bug: every comment body came back empty because
        // .usertext-body sits inside a <form>, not directly under .entry.
        let page = fixture("ebpf_6comments.html");
        let thread = try_extract_thread(&page, PROGRAMMING_THREAD).expect("should parse");

        // 6 comments total: 5 top-level + 1 nested reply (admalledd under ejrh).
        assert_eq!(thread.comments.len(), 5, "5 top-level comments");
        assert_eq!(total_comments(&thread.comments), 6, "6 comments incl. nested");

        let teerre = thread
            .comments
            .iter()
            .find(|c| c.author == "teerre")
            .expect("teerre");
        assert!(
            teerre.body.contains("Very cool blog"),
            "body must be populated, got {:?}",
            teerre.body
        );
        // Score comes from .score.unvoted title (the real value), not the
        // ±1 likes/dislikes siblings.
        assert_eq!(
            teerre.score,
            Some(10),
            "unvoted score, not dislikes(9)/likes(11)"
        );
        assert!(
            thread.comments.iter().all(|c| !c.body.is_empty()),
            "no comment body should be empty"
        );
    }

    #[test]
    fn real_nested_comment_tree() {
        // pandas has structurally-nested replies (.child > .sitetable >
        // .comment). data-depth/data-replies are absent in logged-out HTML.
        fn height(cs: &[RedditComment]) -> usize {
            cs.iter().map(|c| 1 + height(&c.replies)).max().unwrap_or(0)
        }

        let page = fixture("pandas_34comments.html");
        let thread = try_extract_thread(&page, PROGRAMMING_THREAD).expect("should parse");

        // 34 rendered comments with content + 1 [deleted] node that old.reddit
        // still shows because it has live replies = 35 nodes in the tree.
        assert_eq!(
            total_comments(&thread.comments),
            35,
            "all comments incl. nested + deleted"
        );

        let has_nested = thread.comments.iter().any(|c| !c.replies.is_empty());
        assert!(has_nested, "at least one comment must have replies");

        let max_depth = height(&thread.comments);
        assert!(max_depth >= 2, "tree should be more than one level deep");

        let first_reply = thread.comments.iter().find_map(|c| c.replies.first());
        assert_eq!(first_reply.map(|r| r.depth), Some(1));
    }

    #[test]
    fn real_morechildren_stubs_skipped() {
        // AskReddit deep thread: 259 .thing[data-fullname=t1_] markers, but
        // some are "load more comments" stubs (data-type=morechildren) with
        // no author/body. They must not appear as ghost comments.
        fn check(cs: &[RedditComment]) {
            for c in cs {
                let ghost = c.body.is_empty() && c.author == "[deleted]" && c.id.is_some();
                assert!(!ghost, "morechildren stub leaked as comment: {:?}", c.id);
                check(&c.replies);
            }
        }

        let page = fixture("askreddit_deep_morechildren.html");
        let thread = try_extract_thread(&page, ASKREDDIT_THREAD).expect("should parse");
        check(&thread.comments);
    }

    #[test]
    fn real_hidden_score_is_none_not_zero() {
        // AskReddit has fresh comments with `.score-hidden` (no .score.unvoted
        // span). These must be None, distinct from a genuine 0-score comment.
        let page = fixture("askreddit_deep_morechildren.html");
        let thread = try_extract_thread(&page, ASKREDDIT_THREAD).expect("should parse");

        let mut flat = Vec::new();
        flatten_into(&thread.comments, &mut flat);
        assert!(
            flat.iter().any(|c| c.score.is_none()),
            "some fresh comments have hidden scores → None"
        );
    }

    #[test]
    fn real_deleted_comment_preserves_subtree() {
        // pandas has a [deleted] comment that still has visible replies. The
        // structural walk must keep it so its children aren't orphaned.
        let page = fixture("pandas_34comments.html");
        let thread = try_extract_thread(&page, PROGRAMMING_THREAD).expect("should parse");

        let mut flat = Vec::new();
        flatten_into(&thread.comments, &mut flat);
        let deleted: Vec<_> = flat
            .into_iter()
            .filter(|c| c.author == "[deleted]")
            .collect();

        assert!(!deleted.is_empty(), "should keep deleted comments");
        assert!(
            deleted.iter().any(|c| !c.replies.is_empty()),
            "a deleted comment with replies must retain its subtree"
        );
        assert!(deleted.iter().all(|c| !c.is_op));
    }

    #[test]
    fn real_markdown_is_commonmark_clean() {
        // Guards the markdown bugs the verification workflow found: no
        // whitespace-only "blank" lines, and ``` fences never indented 4+
        // spaces (which would turn them into literal indented code blocks).
        let page = fixture("elixir_60comments.html");
        let result = try_extract(&page, PROGRAMMING_THREAD).expect("should extract");
        let md = &result.content.markdown;

        assert!(md.starts_with("# "));
        assert!(md.contains("## Comments"));

        for line in md.lines() {
            let whitespace_only = line.starts_with(' ') && line.trim().is_empty();
            assert!(!whitespace_only, "whitespace-only line: {line:?}");

            // Look past blockquote markers to find fence lines.
            let unquoted = line.trim_start_matches(['>', ' ']);
            if unquoted.starts_with("```") {
                let indent = line.len() - line.trim_start_matches(' ').len();
                assert!(indent < 4, "code fence indented {indent} spaces: {line:?}");
            }
        }

        assert!(result.metadata.word_count > 20);
    }
}
|
||||||
596
crates/webclaw-core/testdata/reddit/askreddit_deep_morechildren.html
vendored
Normal file
596
crates/webclaw-core/testdata/reddit/askreddit_deep_morechildren.html
vendored
Normal file
File diff suppressed because one or more lines are too long
82
crates/webclaw-core/testdata/reddit/ebpf_6comments.html
vendored
Normal file
82
crates/webclaw-core/testdata/reddit/ebpf_6comments.html
vendored
Normal file
File diff suppressed because one or more lines are too long
312
crates/webclaw-core/testdata/reddit/elixir_60comments.html
vendored
Normal file
312
crates/webclaw-core/testdata/reddit/elixir_60comments.html
vendored
Normal file
File diff suppressed because one or more lines are too long
227
crates/webclaw-core/testdata/reddit/pandas_34comments.html
vendored
Normal file
227
crates/webclaw-core/testdata/reddit/pandas_34comments.html
vendored
Normal file
File diff suppressed because one or more lines are too long
234
crates/webclaw-core/testdata/reddit/rust_selfpost_36comments.html
vendored
Normal file
234
crates/webclaw-core/testdata/reddit/rust_selfpost_36comments.html
vendored
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -13,7 +13,12 @@ thiserror = { workspace = true }
|
||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
tokio = { workspace = true }
|
tokio = { workspace = true }
|
||||||
async-trait = "0.1"
|
async-trait = "0.1"
|
||||||
wreq = { version = "6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] }
|
# Pinned to exact pre-release versions: wreq/wreq-util are release candidates
|
||||||
|
# with no semver stability between rc.N builds (rc.29 broke the TLS + Response
|
||||||
|
# API). An exact pin keeps `cargo build`, `cargo install` (which ignores
|
||||||
|
# Cargo.lock), and the release workflow all on the version that compiles.
|
||||||
|
wreq = { version = "=6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] }
|
||||||
|
wreq-util = "=3.0.0-rc.10"
|
||||||
http = "1"
|
http = "1"
|
||||||
bytes = "1"
|
bytes = "1"
|
||||||
url = "2"
|
url = "2"
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,10 @@ pub enum BrowserProfile {
|
||||||
#[default]
|
#[default]
|
||||||
Chrome,
|
Chrome,
|
||||||
Firefox,
|
Firefox,
|
||||||
|
/// Safari iOS 26 (iPhone). The one profile proven to defeat
|
||||||
|
/// DataDome's immobiliare.it / idealista.it / target.com-class
|
||||||
|
/// rules when paired with a country-scoped residential proxy.
|
||||||
|
SafariIos,
|
||||||
/// Randomly pick from all available profiles on each request.
|
/// Randomly pick from all available profiles on each request.
|
||||||
Random,
|
Random,
|
||||||
}
|
}
|
||||||
|
|
@ -18,6 +22,7 @@ pub enum BrowserVariant {
|
||||||
ChromeMacos,
|
ChromeMacos,
|
||||||
Firefox,
|
Firefox,
|
||||||
Safari,
|
Safari,
|
||||||
|
SafariIos26,
|
||||||
Edge,
|
Edge,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -95,12 +95,30 @@ struct Response {
|
||||||
/// per page in collapse_whitespace + strip_markdown).
|
/// per page in collapse_whitespace + strip_markdown).
|
||||||
const MAX_BODY_BYTES: u64 = 50 * 1024 * 1024;
|
const MAX_BODY_BYTES: u64 = 50 * 1024 * 1024;
|
||||||
|
|
||||||
|
/// Running decompression-bomb guard: reject as soon as the bytes already
|
||||||
|
/// buffered plus the next decompressed chunk would cross [`MAX_BODY_BYTES`].
|
||||||
|
/// Saturating arithmetic so a huge chunk length can't wrap the sum.
|
||||||
|
fn check_body_ceiling(buffered: usize, next_chunk: usize) -> Result<(), FetchError> {
|
||||||
|
let total = (buffered as u64).saturating_add(next_chunk as u64);
|
||||||
|
if total > MAX_BODY_BYTES {
|
||||||
|
return Err(FetchError::BodyDecode(format!(
|
||||||
|
"response body exceeds cap {MAX_BODY_BYTES} bytes (decompressed)"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
impl Response {
|
impl Response {
|
||||||
/// Buffer a wreq response into an owned Response. Rejects bodies that
|
/// Buffer a wreq response into an owned Response.
|
||||||
/// advertise a Content-Length beyond [`MAX_BODY_BYTES`] before we pay
|
///
|
||||||
/// the allocation, and truncates after the fact as a belt-and-braces
|
/// Rejects bodies that advertise a Content-Length beyond
|
||||||
/// check against a lying server.
|
/// [`MAX_BODY_BYTES`] before we pay any allocation, then streams the
|
||||||
async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
|
/// body chunk-by-chunk while enforcing a running ceiling. `chunk()`
|
||||||
|
/// yields *post-decompression* bytes (gzip/brotli/zstd/deflate are
|
||||||
|
/// negotiated), so a tiny compressed payload that inflates to
|
||||||
|
/// gigabytes is aborted as soon as the accumulated size crosses the
|
||||||
|
/// cap — it never gets fully buffered in memory.
|
||||||
|
async fn from_wreq(mut resp: wreq::Response) -> Result<Self, FetchError> {
|
||||||
if let Some(len) = resp.content_length()
|
if let Some(len) = resp.content_length()
|
||||||
&& len > MAX_BODY_BYTES
|
&& len > MAX_BODY_BYTES
|
||||||
{
|
{
|
||||||
|
|
@ -111,21 +129,22 @@ impl Response {
|
||||||
let status = resp.status().as_u16();
|
let status = resp.status().as_u16();
|
||||||
let url = resp.uri().to_string();
|
let url = resp.uri().to_string();
|
||||||
let headers = resp.headers().clone();
|
let headers = resp.headers().clone();
|
||||||
let body = resp
|
|
||||||
.bytes()
|
let mut buf = bytes::BytesMut::new();
|
||||||
|
while let Some(chunk) = resp
|
||||||
|
.chunk()
|
||||||
.await
|
.await
|
||||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
.map_err(|e| FetchError::BodyDecode(e.to_string()))?
|
||||||
if body.len() as u64 > MAX_BODY_BYTES {
|
{
|
||||||
return Err(FetchError::BodyDecode(format!(
|
check_body_ceiling(buf.len(), chunk.len())?;
|
||||||
"response body {} bytes exceeds cap {MAX_BODY_BYTES}",
|
buf.extend_from_slice(&chunk);
|
||||||
body.len()
|
|
||||||
)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
status,
|
status,
|
||||||
url,
|
url,
|
||||||
headers,
|
headers,
|
||||||
body,
|
body: buf.freeze(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -141,9 +160,6 @@ impl Response {
|
||||||
fn body(&self) -> &[u8] {
|
fn body(&self) -> &[u8] {
|
||||||
&self.body
|
&self.body
|
||||||
}
|
}
|
||||||
fn is_success(&self) -> bool {
|
|
||||||
(200..300).contains(&self.status)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn text(&self) -> std::borrow::Cow<'_, str> {
|
fn text(&self) -> std::borrow::Cow<'_, str> {
|
||||||
String::from_utf8_lossy(&self.body)
|
String::from_utf8_lossy(&self.body)
|
||||||
|
|
@ -199,6 +215,8 @@ impl FetchClient {
|
||||||
config.timeout,
|
config.timeout,
|
||||||
&config.headers,
|
&config.headers,
|
||||||
config.proxy.as_deref(),
|
config.proxy.as_deref(),
|
||||||
|
config.follow_redirects,
|
||||||
|
config.max_redirects,
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
.collect::<Result<Vec<_>, _>>()?;
|
.collect::<Result<Vec<_>, _>>()?;
|
||||||
|
|
@ -218,7 +236,14 @@ impl FetchClient {
|
||||||
.iter()
|
.iter()
|
||||||
.map(|proxy| {
|
.map(|proxy| {
|
||||||
let v = *variants.choose(&mut rng).unwrap();
|
let v = *variants.choose(&mut rng).unwrap();
|
||||||
crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy))
|
crate::tls::build_client(
|
||||||
|
v,
|
||||||
|
config.timeout,
|
||||||
|
&config.headers,
|
||||||
|
Some(proxy),
|
||||||
|
config.follow_redirects,
|
||||||
|
config.max_redirects,
|
||||||
|
)
|
||||||
})
|
})
|
||||||
.collect::<Result<Vec<_>, _>>()?;
|
.collect::<Result<Vec<_>, _>>()?;
|
||||||
|
|
||||||
|
|
@ -261,10 +286,48 @@ impl FetchClient {
|
||||||
self.cloud.as_deref()
|
self.cloud.as_deref()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Fetch a URL with per-site rescue paths: Reddit URLs redirect to the
|
||||||
|
/// `.json` API, and Akamai-style challenge responses trigger a homepage
|
||||||
|
/// cookie warmup and a retry. Returns the same `FetchResult` shape as
|
||||||
|
/// [`Self::fetch`] so every caller (CLI, MCP, OSS server, production
|
||||||
|
/// server) benefits without shape churn.
|
||||||
|
///
|
||||||
|
/// This is the method most callers want. Use plain [`Self::fetch`] only
|
||||||
|
/// when you need literal no-rescue behavior (e.g. inside the rescue
|
||||||
|
/// logic itself to avoid recursion).
|
||||||
|
pub async fn fetch_smart(&self, url: &str) -> Result<FetchResult, FetchError> {
|
||||||
|
// Reddit: fetch old.reddit.com for stable server-rendered HTML.
|
||||||
|
// The JSON API is blocked; old.reddit.com works without JS or auth.
|
||||||
|
let owned;
|
||||||
|
let url = if crate::reddit::is_reddit_url(url) {
|
||||||
|
owned = crate::reddit::to_old_reddit_url(url);
|
||||||
|
owned.as_str()
|
||||||
|
} else {
|
||||||
|
url
|
||||||
|
};
|
||||||
|
|
||||||
|
let resp = self.fetch(url).await?;
|
||||||
|
|
||||||
|
// Akamai / bazadebezolkohpepadr challenge: visit the homepage to
|
||||||
|
// collect warmup cookies (_abck, bm_sz, etc.), then retry.
|
||||||
|
if is_challenge_html(&resp.html)
|
||||||
|
&& let Some(homepage) = extract_homepage(url)
|
||||||
|
{
|
||||||
|
debug!("challenge detected, warming cookies via {homepage}");
|
||||||
|
let _ = self.fetch(&homepage).await;
|
||||||
|
if let Ok(retry) = self.fetch(url).await {
|
||||||
|
return Ok(retry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(resp)
|
||||||
|
}
|
||||||
|
|
||||||
/// Fetch a URL and return the raw HTML + response metadata.
|
/// Fetch a URL and return the raw HTML + response metadata.
|
||||||
///
|
///
|
||||||
/// Automatically retries on transient failures (network errors, 5xx, 429)
|
/// Automatically retries on transient failures (network errors, 5xx, 429)
|
||||||
/// with exponential backoff: 0s, 1s (2 attempts total).
|
/// with exponential backoff: 0s, 1s (2 attempts total). No per-site
|
||||||
|
/// rescue logic; use [`Self::fetch_smart`] for that.
|
||||||
#[instrument(skip(self), fields(url = %url))]
|
#[instrument(skip(self), fields(url = %url))]
|
||||||
pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
|
pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
|
||||||
let delays = [Duration::ZERO, Duration::from_secs(1)];
|
let delays = [Duration::ZERO, Duration::from_secs(1)];
|
||||||
|
|
@ -324,6 +387,8 @@ impl FetchClient {
|
||||||
url: &str,
|
url: &str,
|
||||||
extra: &[(&str, &str)],
|
extra: &[(&str, &str)],
|
||||||
) -> Result<FetchResult, FetchError> {
|
) -> Result<FetchResult, FetchError> {
|
||||||
|
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
|
||||||
|
let url = parsed_url.as_str();
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
let client = self.pick_client(url);
|
let client = self.pick_client(url);
|
||||||
|
|
||||||
|
|
@ -408,22 +473,19 @@ impl FetchClient {
|
||||||
url: &str,
|
url: &str,
|
||||||
options: &webclaw_core::ExtractionOptions,
|
options: &webclaw_core::ExtractionOptions,
|
||||||
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
||||||
// Reddit fallback: use their JSON API to get post + full comment tree.
|
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
|
||||||
if crate::reddit::is_reddit_url(url) {
|
let url = parsed_url.as_str();
|
||||||
let json_url = crate::reddit::json_url(url);
|
|
||||||
debug!("reddit detected, fetching {json_url}");
|
|
||||||
|
|
||||||
let client = self.pick_client(url);
|
// Reddit: rewrite to old.reddit.com for stable server-rendered HTML.
|
||||||
let resp = client.get(&json_url).send().await?;
|
// webclaw-core's Reddit fast path then parses the thread structure.
|
||||||
let response = Response::from_wreq(resp).await?;
|
let reddit_owned;
|
||||||
if response.is_success() {
|
let url = if crate::reddit::is_reddit_url(url) {
|
||||||
let bytes = response.body();
|
reddit_owned = crate::reddit::to_old_reddit_url(url);
|
||||||
match crate::reddit::parse_reddit_json(bytes, url) {
|
debug!("reddit: rewriting to {reddit_owned}");
|
||||||
Ok(result) => return Ok(result),
|
reddit_owned.as_str()
|
||||||
Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
|
} else {
|
||||||
}
|
url
|
||||||
}
|
};
|
||||||
}
|
|
||||||
|
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
let client = self.pick_client(url);
|
let client = self.pick_client(url);
|
||||||
|
|
@ -436,7 +498,7 @@ impl FetchClient {
|
||||||
&& let Some(homepage) = extract_homepage(url)
|
&& let Some(homepage) = extract_homepage(url)
|
||||||
{
|
{
|
||||||
debug!("challenge detected, warming cookies via {homepage}");
|
debug!("challenge detected, warming cookies via {homepage}");
|
||||||
let _ = client.get(&homepage).send().await;
|
let _ = self.fetch(&homepage).await;
|
||||||
let resp = client.get(url).send().await?;
|
let resp = client.get(url).send().await?;
|
||||||
response = Response::from_wreq(resp).await?;
|
response = Response::from_wreq(resp).await?;
|
||||||
debug!("retried after cookie warmup: status={}", response.status());
|
debug!("retried after cookie warmup: status={}", response.status());
|
||||||
|
|
@ -635,6 +697,7 @@ fn collect_variants(profile: &BrowserProfile) -> Vec<BrowserVariant> {
|
||||||
BrowserProfile::Random => browser::all_variants(),
|
BrowserProfile::Random => browser::all_variants(),
|
||||||
BrowserProfile::Chrome => vec![browser::latest_chrome()],
|
BrowserProfile::Chrome => vec![browser::latest_chrome()],
|
||||||
BrowserProfile::Firefox => vec![browser::latest_firefox()],
|
BrowserProfile::Firefox => vec![browser::latest_firefox()],
|
||||||
|
BrowserProfile::SafariIos => vec![BrowserVariant::SafariIos26],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -712,22 +775,27 @@ fn is_pdf_content_type(headers: &http::HeaderMap) -> bool {
|
||||||
|
|
||||||
/// Detect if a response looks like a bot protection challenge page.
|
/// Detect if a response looks like a bot protection challenge page.
|
||||||
fn is_challenge_response(response: &Response) -> bool {
|
fn is_challenge_response(response: &Response) -> bool {
|
||||||
let len = response.body().len();
|
let body_len = response.body().len();
|
||||||
|
if body_len > 15_000 || body_len == 0 {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
is_challenge_html(response.text().as_ref())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Same as `is_challenge_response`, operating on a body string directly
|
||||||
|
/// so callers holding a `FetchResult` can reuse the heuristic.
|
||||||
|
fn is_challenge_html(html: &str) -> bool {
|
||||||
|
let len = html.len();
|
||||||
if len > 15_000 || len == 0 {
|
if len > 15_000 || len == 0 {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
let lower = html.to_lowercase();
|
||||||
let text = response.text();
|
|
||||||
let lower = text.to_lowercase();
|
|
||||||
|
|
||||||
if lower.contains("<title>challenge page</title>") {
|
if lower.contains("<title>challenge page</title>") {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if lower.contains("bazadebezolkohpepadr") && len < 5_000 {
|
if lower.contains("bazadebezolkohpepadr") && len < 5_000 {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -820,6 +888,28 @@ mod tests {
|
||||||
assert!(err.result.is_err());
|
assert!(err.result.is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn body_ceiling_allows_under_cap() {
|
||||||
|
assert!(check_body_ceiling(0, 1024).is_ok());
|
||||||
|
assert!(check_body_ceiling(MAX_BODY_BYTES as usize - 1, 1).is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn body_ceiling_rejects_at_and_over_cap() {
|
||||||
|
// Exactly at the cap is allowed; one byte over is rejected.
|
||||||
|
assert!(check_body_ceiling(MAX_BODY_BYTES as usize, 1).is_err());
|
||||||
|
// A small buffer plus a huge inflated chunk (decompression bomb)
|
||||||
|
// is caught on the very first oversized chunk.
|
||||||
|
let err = check_body_ceiling(16, 64 * 1024 * 1024).unwrap_err();
|
||||||
|
assert!(matches!(err, FetchError::BodyDecode(_)));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn body_ceiling_saturates_on_overflow() {
|
||||||
|
// usize::MAX chunk must not wrap the running sum to a small value.
|
||||||
|
assert!(check_body_ceiling(usize::MAX, usize::MAX).is_err());
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_batch_extract_result_struct() {
|
fn test_batch_extract_result_struct() {
|
||||||
let err = BatchExtractResult {
|
let err = BatchExtractResult {
|
||||||
|
|
|
||||||
|
|
@ -810,13 +810,18 @@ mod tests {
|
||||||
|
|
||||||
// --- CloudClient construction ------------------------------------------
|
// --- CloudClient construction ------------------------------------------
|
||||||
|
|
||||||
|
// `WEBCLAW_API_KEY` is process-global; cargo runs tests in parallel
|
||||||
|
// threads. Without serialization, a test that sets the var can race a
|
||||||
|
// test asserting it is absent. This lock makes the env-mutating
|
||||||
|
// CloudClient tests mutually exclusive (poison-tolerant: a panicking
|
||||||
|
// test must not wedge the others).
|
||||||
|
static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn cloud_client_explicit_key_wins_over_env() {
|
fn cloud_client_explicit_key_wins_over_env() {
|
||||||
// SAFETY: this test mutates process env. Serial tests only.
|
let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
|
||||||
// Set env to something, pass an explicit key, explicit should win.
|
// SAFETY: env mutation is serialized by ENV_LOCK; set_var/remove_var
|
||||||
// (We don't actually *call* the API, just check the struct stored
|
// are unsafe on the 2024 toolchain. Explicit key must beat the env.
|
||||||
// the right key.)
|
|
||||||
// rustc std::env::set_var is unsafe in newer toolchains.
|
|
||||||
unsafe {
|
unsafe {
|
||||||
std::env::set_var("WEBCLAW_API_KEY", "from-env");
|
std::env::set_var("WEBCLAW_API_KEY", "from-env");
|
||||||
}
|
}
|
||||||
|
|
@ -829,6 +834,9 @@ mod tests {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn cloud_client_none_when_empty() {
|
fn cloud_client_none_when_empty() {
|
||||||
|
let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
|
||||||
|
// SAFETY: env mutation serialized by ENV_LOCK. Clearing the var
|
||||||
|
// (incl. any ambient runner value) is what makes this deterministic.
|
||||||
unsafe {
|
unsafe {
|
||||||
std::env::remove_var("WEBCLAW_API_KEY");
|
std::env::remove_var("WEBCLAW_API_KEY");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,7 @@ use std::sync::OnceLock;
|
||||||
|
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use serde_json::{Value, json};
|
use serde_json::{Value, json};
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
use super::ExtractorInfo;
|
use super::ExtractorInfo;
|
||||||
use crate::cloud::{self, CloudError};
|
use crate::cloud::{self, CloudError};
|
||||||
|
|
@ -52,8 +53,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
|
||||||
};
|
};
|
||||||
|
|
||||||
pub fn matches(url: &str) -> bool {
|
pub fn matches(url: &str) -> bool {
|
||||||
let host = host_of(url);
|
let Some(host) = host_of(url) else {
|
||||||
if !is_amazon_host(host) {
|
return false;
|
||||||
|
};
|
||||||
|
if !is_amazon_host(&host) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
parse_asin(url).is_some()
|
parse_asin(url).is_some()
|
||||||
|
|
@ -162,17 +165,41 @@ pub fn parse(html: &str, url: &str, asin: &str) -> Value {
|
||||||
// URL helpers
|
// URL helpers
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
fn host_of(url: &str) -> &str {
|
fn host_of(url: &str) -> Option<String> {
|
||||||
url.split("://")
|
let parsed = Url::parse(url).ok()?;
|
||||||
.nth(1)
|
if !parsed.username().is_empty() || parsed.password().is_some() {
|
||||||
.unwrap_or(url)
|
return None;
|
||||||
.split('/')
|
}
|
||||||
.next()
|
parsed.host_str().map(|host| host.to_ascii_lowercase())
|
||||||
.unwrap_or("")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_amazon_host(host: &str) -> bool {
|
fn is_amazon_host(host: &str) -> bool {
|
||||||
host.starts_with("www.amazon.") || host.starts_with("amazon.")
|
const AMAZON_HOSTS: &[&str] = &[
|
||||||
|
"amazon.ae",
|
||||||
|
"amazon.ca",
|
||||||
|
"amazon.cn",
|
||||||
|
"amazon.co.jp",
|
||||||
|
"amazon.co.uk",
|
||||||
|
"amazon.com",
|
||||||
|
"amazon.com.au",
|
||||||
|
"amazon.com.be",
|
||||||
|
"amazon.com.br",
|
||||||
|
"amazon.com.mx",
|
||||||
|
"amazon.com.tr",
|
||||||
|
"amazon.de",
|
||||||
|
"amazon.eg",
|
||||||
|
"amazon.es",
|
||||||
|
"amazon.fr",
|
||||||
|
"amazon.in",
|
||||||
|
"amazon.it",
|
||||||
|
"amazon.nl",
|
||||||
|
"amazon.pl",
|
||||||
|
"amazon.sa",
|
||||||
|
"amazon.se",
|
||||||
|
"amazon.sg",
|
||||||
|
];
|
||||||
|
let normalized = host.strip_prefix("www.").unwrap_or(host);
|
||||||
|
AMAZON_HOSTS.contains(&normalized)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Pull a 10-char ASIN out of any recognised Amazon URL shape:
|
/// Pull a 10-char ASIN out of any recognised Amazon URL shape:
|
||||||
|
|
@ -347,6 +374,9 @@ mod tests {
|
||||||
assert!(matches("https://www.amazon.com/dp/B0CHX1W1XY"));
|
assert!(matches("https://www.amazon.com/dp/B0CHX1W1XY"));
|
||||||
assert!(matches("https://www.amazon.co.uk/dp/B0CHX1W1XY/"));
|
assert!(matches("https://www.amazon.co.uk/dp/B0CHX1W1XY/"));
|
||||||
assert!(matches("https://www.amazon.de/dp/B0CHX1W1XY?psc=1"));
|
assert!(matches("https://www.amazon.de/dp/B0CHX1W1XY?psc=1"));
|
||||||
|
assert!(matches("https://www.amazon.ca/dp/B0CHX1W1XY"));
|
||||||
|
assert!(matches("https://www.amazon.com.au/dp/B0CHX1W1XY"));
|
||||||
|
assert!(matches("https://www.amazon.in/dp/B0CHX1W1XY"));
|
||||||
assert!(matches(
|
assert!(matches(
|
||||||
"https://www.amazon.com/gp/product/B0CHX1W1XY/ref=foo"
|
"https://www.amazon.com/gp/product/B0CHX1W1XY/ref=foo"
|
||||||
));
|
));
|
||||||
|
|
@ -357,6 +387,8 @@ mod tests {
|
||||||
assert!(!matches("https://www.amazon.com/"));
|
assert!(!matches("https://www.amazon.com/"));
|
||||||
assert!(!matches("https://www.amazon.com/gp/cart"));
|
assert!(!matches("https://www.amazon.com/gp/cart"));
|
||||||
assert!(!matches("https://example.com/dp/B0CHX1W1XY"));
|
assert!(!matches("https://example.com/dp/B0CHX1W1XY"));
|
||||||
|
assert!(!matches("https://www.amazon.com@127.0.0.1/dp/B0CHX1W1XY"));
|
||||||
|
assert!(!matches("https://www.amazon.evil.com/dp/B0CHX1W1XY"));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ use std::sync::OnceLock;
|
||||||
|
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use serde_json::{Value, json};
|
use serde_json::{Value, json};
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
use super::ExtractorInfo;
|
use super::ExtractorInfo;
|
||||||
use crate::cloud::{self, CloudError};
|
use crate::cloud::{self, CloudError};
|
||||||
|
|
@ -32,8 +33,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
|
||||||
};
|
};
|
||||||
|
|
||||||
pub fn matches(url: &str) -> bool {
|
pub fn matches(url: &str) -> bool {
|
||||||
let host = host_of(url);
|
let Some(host) = host_of(url) else {
|
||||||
if !is_ebay_host(host) {
|
return false;
|
||||||
|
};
|
||||||
|
if !is_ebay_host(&host) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
parse_item_id(url).is_some()
|
parse_item_id(url).is_some()
|
||||||
|
|
@ -120,17 +123,37 @@ pub fn parse(html: &str, url: &str, item_id: &str) -> Value {
|
||||||
// URL helpers
|
// URL helpers
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
fn host_of(url: &str) -> &str {
|
fn host_of(url: &str) -> Option<String> {
|
||||||
url.split("://")
|
let parsed = Url::parse(url).ok()?;
|
||||||
.nth(1)
|
if !parsed.username().is_empty() || parsed.password().is_some() {
|
||||||
.unwrap_or(url)
|
return None;
|
||||||
.split('/')
|
}
|
||||||
.next()
|
parsed.host_str().map(|host| host.to_ascii_lowercase())
|
||||||
.unwrap_or("")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_ebay_host(host: &str) -> bool {
|
fn is_ebay_host(host: &str) -> bool {
|
||||||
host.starts_with("www.ebay.") || host.starts_with("ebay.")
|
const EBAY_HOSTS: &[&str] = &[
|
||||||
|
"ebay.at",
|
||||||
|
"ebay.be",
|
||||||
|
"ebay.ca",
|
||||||
|
"ebay.ch",
|
||||||
|
"ebay.co.uk",
|
||||||
|
"ebay.com",
|
||||||
|
"ebay.com.au",
|
||||||
|
"ebay.com.hk",
|
||||||
|
"ebay.com.my",
|
||||||
|
"ebay.com.sg",
|
||||||
|
"ebay.de",
|
||||||
|
"ebay.es",
|
||||||
|
"ebay.fr",
|
||||||
|
"ebay.ie",
|
||||||
|
"ebay.it",
|
||||||
|
"ebay.nl",
|
||||||
|
"ebay.ph",
|
||||||
|
"ebay.pl",
|
||||||
|
];
|
||||||
|
let normalized = host.strip_prefix("www.").unwrap_or(host);
|
||||||
|
EBAY_HOSTS.contains(&normalized)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Pull the numeric item id out of `/itm/{id}` or `/itm/{slug}/{id}`
|
/// Pull the numeric item id out of `/itm/{id}` or `/itm/{slug}/{id}`
|
||||||
|
|
@ -273,9 +296,14 @@ mod tests {
|
||||||
"https://www.ebay.com/itm/vintage-typewriter/325478156234"
|
"https://www.ebay.com/itm/vintage-typewriter/325478156234"
|
||||||
));
|
));
|
||||||
assert!(matches("https://www.ebay.co.uk/itm/325478156234"));
|
assert!(matches("https://www.ebay.co.uk/itm/325478156234"));
|
||||||
|
assert!(matches("https://www.ebay.ca/itm/325478156234"));
|
||||||
|
assert!(matches("https://www.ebay.com.au/itm/325478156234"));
|
||||||
|
assert!(matches("https://www.ebay.es/itm/325478156234"));
|
||||||
assert!(!matches("https://www.ebay.com/"));
|
assert!(!matches("https://www.ebay.com/"));
|
||||||
assert!(!matches("https://www.ebay.com/sch/foo"));
|
assert!(!matches("https://www.ebay.com/sch/foo"));
|
||||||
assert!(!matches("https://example.com/itm/325478156234"));
|
assert!(!matches("https://example.com/itm/325478156234"));
|
||||||
|
assert!(!matches("https://www.ebay.com@127.0.0.1/itm/325478156234"));
|
||||||
|
assert!(!matches("https://www.ebay.attacker.com/itm/325478156234"));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,10 @@
|
||||||
//! Reddit structured extractor — returns the full post + comment tree
|
//! Reddit structured extractor — parses old.reddit.com HTML.
|
||||||
//! as typed JSON via Reddit's `.json` API.
|
|
||||||
//!
|
//!
|
||||||
//! The same trick the markdown extractor in `crate::reddit` uses:
|
//! Fetches old.reddit.com (stable server-rendered HTML, no JS required)
|
||||||
//! appending `.json` to any post URL returns the data the new SPA
|
//! and delegates parsing to `webclaw_core::reddit`. Returns a typed JSON
|
||||||
//! frontend would load client-side. Zero antibot, zero JS rendering.
|
//! value with `{ url, post, comments }` structure.
|
||||||
|
|
||||||
use serde::Deserialize;
|
use serde_json::Value;
|
||||||
use serde_json::{Value, json};
|
|
||||||
|
|
||||||
use super::ExtractorInfo;
|
use super::ExtractorInfo;
|
||||||
use crate::error::FetchError;
|
use crate::error::FetchError;
|
||||||
|
|
@ -24,182 +22,27 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
|
||||||
};
|
};
|
||||||
|
|
||||||
pub fn matches(url: &str) -> bool {
|
pub fn matches(url: &str) -> bool {
|
||||||
let host = host_of(url);
|
webclaw_core::reddit::is_reddit_url(url) && url.contains("/comments/")
|
||||||
let is_reddit_host = matches!(
|
|
||||||
host,
|
|
||||||
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
|
|
||||||
);
|
|
||||||
is_reddit_host && url.contains("/comments/")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchError> {
|
pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchError> {
|
||||||
let json_url = build_json_url(url);
|
let fetch_url = crate::reddit::to_old_reddit_url(url);
|
||||||
let resp = client.fetch(&json_url).await?;
|
let resp = client.fetch(&fetch_url).await?;
|
||||||
if resp.status != 200 {
|
if resp.status != 200 {
|
||||||
return Err(FetchError::Build(format!(
|
return Err(FetchError::Build(format!(
|
||||||
"reddit api returned status {}",
|
"reddit: unexpected status {}",
|
||||||
resp.status
|
resp.status
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
let listings: Vec<Listing> = serde_json::from_str(&resp.html)
|
let thread = webclaw_core::reddit::try_extract_thread(&resp.html, url).ok_or_else(|| {
|
||||||
.map_err(|e| FetchError::BodyDecode(format!("reddit json parse: {e}")))?;
|
FetchError::BodyDecode(
|
||||||
|
"reddit: page structure not recognised — is this a thread URL?".into(),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
if listings.is_empty() {
|
serde_json::to_value(&thread)
|
||||||
return Err(FetchError::BodyDecode("reddit response empty".into()));
|
.map_err(|e| FetchError::BodyDecode(format!("reddit: serialisation error: {e}")))
|
||||||
}
|
|
||||||
|
|
||||||
// First listing = the post (single t3 child).
|
|
||||||
let post = listings
|
|
||||||
.first()
|
|
||||||
.and_then(|l| l.data.children.first())
|
|
||||||
.filter(|t| t.kind == "t3")
|
|
||||||
.map(|t| post_json(&t.data))
|
|
||||||
.unwrap_or(Value::Null);
|
|
||||||
|
|
||||||
// Second listing = the comment tree.
|
|
||||||
let comments: Vec<Value> = listings
|
|
||||||
.get(1)
|
|
||||||
.map(|l| l.data.children.iter().filter_map(comment_json).collect())
|
|
||||||
.unwrap_or_default();
|
|
||||||
|
|
||||||
Ok(json!({
|
|
||||||
"url": url,
|
|
||||||
"post": post,
|
|
||||||
"comments": comments,
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// JSON shapers
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
fn post_json(d: &ThingData) -> Value {
|
|
||||||
json!({
|
|
||||||
"id": d.id,
|
|
||||||
"title": d.title,
|
|
||||||
"author": d.author,
|
|
||||||
"subreddit": d.subreddit_name_prefixed,
|
|
||||||
"permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
|
|
||||||
"url": d.url_overridden_by_dest,
|
|
||||||
"is_self": d.is_self,
|
|
||||||
"selftext": d.selftext,
|
|
||||||
"score": d.score,
|
|
||||||
"upvote_ratio": d.upvote_ratio,
|
|
||||||
"num_comments": d.num_comments,
|
|
||||||
"created_utc": d.created_utc,
|
|
||||||
"link_flair_text": d.link_flair_text,
|
|
||||||
"over_18": d.over_18,
|
|
||||||
"spoiler": d.spoiler,
|
|
||||||
"stickied": d.stickied,
|
|
||||||
"locked": d.locked,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Render a single comment + its reply tree. Returns `None` for non-t1
|
|
||||||
/// kinds (the trailing `more` placeholder Reddit injects at depth limits).
|
|
||||||
fn comment_json(thing: &Thing) -> Option<Value> {
|
|
||||||
if thing.kind != "t1" {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
let d = &thing.data;
|
|
||||||
let replies: Vec<Value> = match &d.replies {
|
|
||||||
Some(Replies::Listing(l)) => l.data.children.iter().filter_map(comment_json).collect(),
|
|
||||||
_ => Vec::new(),
|
|
||||||
};
|
|
||||||
Some(json!({
|
|
||||||
"id": d.id,
|
|
||||||
"author": d.author,
|
|
||||||
"body": d.body,
|
|
||||||
"score": d.score,
|
|
||||||
"created_utc": d.created_utc,
|
|
||||||
"is_submitter": d.is_submitter,
|
|
||||||
"stickied": d.stickied,
|
|
||||||
"depth": d.depth,
|
|
||||||
"permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
|
|
||||||
"replies": replies,
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// URL helpers
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
fn host_of(url: &str) -> &str {
|
|
||||||
url.split("://")
|
|
||||||
.nth(1)
|
|
||||||
.unwrap_or(url)
|
|
||||||
.split('/')
|
|
||||||
.next()
|
|
||||||
.unwrap_or("")
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Build the Reddit JSON URL. We keep the original host (`www.reddit.com`
|
|
||||||
/// or `old.reddit.com` as the caller gave us). Routing through
|
|
||||||
/// `old.reddit.com` unconditionally looks appealing but that host has
|
|
||||||
/// stricter UA-based blocking than `www.reddit.com`, while the main
|
|
||||||
/// host accepts our Chrome-fingerprinted client fine.
|
|
||||||
fn build_json_url(url: &str) -> String {
|
|
||||||
let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
|
|
||||||
format!("{clean}.json?raw_json=1")
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Reddit JSON types — only fields we render. Everything else is dropped.
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct Listing {
|
|
||||||
data: ListingData,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct ListingData {
|
|
||||||
children: Vec<Thing>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct Thing {
|
|
||||||
kind: String,
|
|
||||||
data: ThingData,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize, Default)]
|
|
||||||
struct ThingData {
|
|
||||||
// post (t3)
|
|
||||||
id: Option<String>,
|
|
||||||
title: Option<String>,
|
|
||||||
selftext: Option<String>,
|
|
||||||
subreddit_name_prefixed: Option<String>,
|
|
||||||
url_overridden_by_dest: Option<String>,
|
|
||||||
is_self: Option<bool>,
|
|
||||||
upvote_ratio: Option<f64>,
|
|
||||||
num_comments: Option<i64>,
|
|
||||||
over_18: Option<bool>,
|
|
||||||
spoiler: Option<bool>,
|
|
||||||
stickied: Option<bool>,
|
|
||||||
locked: Option<bool>,
|
|
||||||
link_flair_text: Option<String>,
|
|
||||||
|
|
||||||
// comment (t1)
|
|
||||||
author: Option<String>,
|
|
||||||
body: Option<String>,
|
|
||||||
score: Option<i64>,
|
|
||||||
created_utc: Option<f64>,
|
|
||||||
is_submitter: Option<bool>,
|
|
||||||
depth: Option<i64>,
|
|
||||||
permalink: Option<String>,
|
|
||||||
|
|
||||||
// recursive
|
|
||||||
replies: Option<Replies>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
#[serde(untagged)]
|
|
||||||
enum Replies {
|
|
||||||
Listing(Listing),
|
|
||||||
#[allow(dead_code)]
|
|
||||||
Empty(String),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
@ -207,28 +50,17 @@ mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn matches_reddit_post_urls() {
|
fn matches_thread_urls() {
|
||||||
assert!(matches(
|
assert!(matches(
|
||||||
"https://www.reddit.com/r/rust/comments/abc123/some_title/"
|
"https://www.reddit.com/r/rust/comments/abc123/some_title/"
|
||||||
));
|
));
|
||||||
assert!(matches(
|
|
||||||
"https://reddit.com/r/rust/comments/abc123/some_title"
|
|
||||||
));
|
|
||||||
assert!(matches("https://old.reddit.com/r/rust/comments/abc123/x/"));
|
assert!(matches("https://old.reddit.com/r/rust/comments/abc123/x/"));
|
||||||
|
assert!(matches("https://reddit.com/r/rust/comments/abc/x"));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn rejects_non_post_reddit_urls() {
|
fn rejects_listing_and_non_reddit() {
|
||||||
assert!(!matches("https://www.reddit.com/r/rust"));
|
assert!(!matches("https://www.reddit.com/r/rust"));
|
||||||
assert!(!matches("https://www.reddit.com/user/foo"));
|
assert!(!matches("https://example.com/r/rust/comments/abc/x"));
|
||||||
assert!(!matches("https://example.com/r/rust/comments/x"));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn json_url_appends_suffix_and_drops_query() {
|
|
||||||
assert_eq!(
|
|
||||||
build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo"),
|
|
||||||
"https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -10,10 +10,12 @@ pub mod error;
|
||||||
pub mod extractors;
|
pub mod extractors;
|
||||||
pub mod fetcher;
|
pub mod fetcher;
|
||||||
pub mod linkedin;
|
pub mod linkedin;
|
||||||
|
pub mod locale;
|
||||||
pub mod proxy;
|
pub mod proxy;
|
||||||
pub mod reddit;
|
pub mod reddit;
|
||||||
pub mod sitemap;
|
pub mod sitemap;
|
||||||
pub mod tls;
|
pub mod tls;
|
||||||
|
pub mod url_security;
|
||||||
|
|
||||||
pub use browser::BrowserProfile;
|
pub use browser::BrowserProfile;
|
||||||
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
|
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
|
||||||
|
|
@ -21,6 +23,7 @@ pub use crawler::{CrawlConfig, CrawlResult, CrawlState, Crawler, PageResult};
|
||||||
pub use error::FetchError;
|
pub use error::FetchError;
|
||||||
pub use fetcher::Fetcher;
|
pub use fetcher::Fetcher;
|
||||||
pub use http::HeaderMap;
|
pub use http::HeaderMap;
|
||||||
|
pub use locale::{accept_language_for_tld, accept_language_for_url};
|
||||||
pub use proxy::{parse_proxy_file, parse_proxy_line};
|
pub use proxy::{parse_proxy_file, parse_proxy_line};
|
||||||
pub use sitemap::SitemapEntry;
|
pub use sitemap::SitemapEntry;
|
||||||
pub use webclaw_pdf::PdfMode;
|
pub use webclaw_pdf::PdfMode;
|
||||||
|
|
|
||||||
77
crates/webclaw-fetch/src/locale.rs
Normal file
77
crates/webclaw-fetch/src/locale.rs
Normal file
|
|
@ -0,0 +1,77 @@
|
||||||
|
//! Derive an `Accept-Language` header from a URL.
|
||||||
|
//!
|
||||||
|
//! DataDome-class bot detection on country-specific sites (e.g. immobiliare.it,
|
||||||
|
//! leboncoin.fr) does a geo-vs-locale sanity check: residential IP in the
|
||||||
|
//! target country + a browser UA but the wrong `Accept-Language` is a bot
|
||||||
|
//! signal. Matching the site's expected locale gets us through.
|
||||||
|
//!
|
||||||
|
//! Default for unmapped TLDs is `en-US,en;q=0.9` — the global fallback.
|
||||||
|
|
||||||
|
/// Best-effort `Accept-Language` header value for the given URL's TLD.
|
||||||
|
/// Returns `None` if the URL cannot be parsed.
|
||||||
|
pub fn accept_language_for_url(url: &str) -> Option<&'static str> {
|
||||||
|
let host = url::Url::parse(url).ok()?.host_str()?.to_ascii_lowercase();
|
||||||
|
let tld = host.rsplit('.').next()?;
|
||||||
|
Some(accept_language_for_tld(tld))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Map a TLD like `it`, `fr`, `de` to a plausible `Accept-Language`.
///
/// The lookup is ASCII-case-insensitive and tolerates leading dots, so
/// `"IT"` and `".it"` behave like `"it"`. Unknown TLDs (including the empty
/// string) fall back to US English.
pub fn accept_language_for_tld(tld: &str) -> &'static str {
    // Normalise once so callers may pass the TLD exactly as they found it.
    let normalized = tld.trim_start_matches('.').to_ascii_lowercase();
    match normalized.as_str() {
        "it" => "it-IT,it;q=0.9",
        "fr" => "fr-FR,fr;q=0.9",
        "de" | "at" => "de-DE,de;q=0.9",
        "es" => "es-ES,es;q=0.9",
        "pt" => "pt-PT,pt;q=0.9",
        "nl" => "nl-NL,nl;q=0.9",
        "pl" => "pl-PL,pl;q=0.9",
        "se" => "sv-SE,sv;q=0.9",
        "no" => "nb-NO,nb;q=0.9",
        "dk" => "da-DK,da;q=0.9",
        "fi" => "fi-FI,fi;q=0.9",
        "cz" => "cs-CZ,cs;q=0.9",
        "ro" => "ro-RO,ro;q=0.9",
        "gr" => "el-GR,el;q=0.9",
        "tr" => "tr-TR,tr;q=0.9",
        "ru" => "ru-RU,ru;q=0.9",
        "jp" => "ja-JP,ja;q=0.9",
        "kr" => "ko-KR,ko;q=0.9",
        "cn" => "zh-CN,zh;q=0.9",
        "tw" | "hk" => "zh-TW,zh;q=0.9",
        "br" => "pt-BR,pt;q=0.9",
        // NOTE(review): Latin-American TLDs reuse the es-ES tag rather than
        // a regional one; confirm this is intended.
        "mx" | "ar" | "co" | "cl" | "pe" => "es-ES,es;q=0.9",
        "uk" | "ie" => "en-GB,en;q=0.9",
        _ => "en-US,en;q=0.9",
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Each URL must map to the header value registered for its TLD.
    #[test]
    fn tld_dispatch() {
        let cases = [
            ("https://www.immobiliare.it/annunci/1", "it-IT,it;q=0.9"),
            ("https://www.leboncoin.fr/", "fr-FR,fr;q=0.9"),
            ("https://www.amazon.co.uk/", "en-GB,en;q=0.9"),
            ("https://example.com/", "en-US,en;q=0.9"),
        ];

        for (url, expected) in cases {
            assert_eq!(accept_language_for_url(url), Some(expected));
        }
    }

    /// Input that does not parse as a URL yields no header at all.
    #[test]
    fn bad_url_returns_none() {
        assert_eq!(accept_language_for_url("not-a-url"), None);
    }
}
|
||||||
|
|
@ -1,172 +1,56 @@
|
||||||
/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
|
//! Reddit URL helpers for the fetch layer.
|
||||||
///
|
//!
|
||||||
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
|
//! The JSON API (`*.json`) is blocked. We rewrite all Reddit hosts to
|
||||||
/// loaded client-side. Appending `.json` to any Reddit URL returns the full
|
//! `old.reddit.com`, which serves stable server-rendered HTML that
|
||||||
/// comment tree as structured JSON, which we convert to clean markdown.
|
//! `webclaw-core::reddit` parses directly.
|
||||||
use serde::Deserialize;
|
|
||||||
use tracing::debug;
|
|
||||||
use webclaw_core::{Content, ExtractionResult, Metadata};
|
|
||||||
|
|
||||||
/// Check if a URL points to a Reddit post/comment page.
|
|
||||||
pub fn is_reddit_url(url: &str) -> bool {
|
pub fn is_reddit_url(url: &str) -> bool {
|
||||||
let host = url
|
webclaw_core::reddit::is_reddit_url(url)
|
||||||
.split("://")
|
|
||||||
.nth(1)
|
|
||||||
.unwrap_or(url)
|
|
||||||
.split('/')
|
|
||||||
.next()
|
|
||||||
.unwrap_or("");
|
|
||||||
matches!(
|
|
||||||
host,
|
|
||||||
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build the `.json` URL from a Reddit page URL.
|
/// Rewrite any Reddit host to old.reddit.com, preserving path and query.
|
||||||
pub fn json_url(url: &str) -> String {
|
pub fn to_old_reddit_url(url: &str) -> String {
|
||||||
let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
|
let Some(scheme_end) = url.find("://") else {
|
||||||
format!("{clean}.json")
|
return url.to_string();
|
||||||
|
};
|
||||||
|
let after = &url[scheme_end + 3..];
|
||||||
|
let host_end = after.find(['/', '?', '#']).unwrap_or(after.len());
|
||||||
|
let scheme = &url[..scheme_end + 3];
|
||||||
|
let rest = &after[host_end..];
|
||||||
|
format!("{scheme}old.reddit.com{rest}")
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Convert Reddit JSON API response into an ExtractionResult.
|
#[cfg(test)]
|
||||||
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
|
mod tests {
|
||||||
let listings: Vec<Listing> =
|
use super::*;
|
||||||
serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
|
|
||||||
|
|
||||||
let mut markdown = String::new();
|
#[test]
|
||||||
let mut title = None;
|
fn rewrites_www_to_old() {
|
||||||
let mut author = None;
|
assert_eq!(
|
||||||
let mut subreddit = None;
|
to_old_reddit_url("https://www.reddit.com/r/rust/comments/abc/x/"),
|
||||||
|
"https://old.reddit.com/r/rust/comments/abc/x/"
|
||||||
// First listing = the post itself
|
);
|
||||||
if let Some(post_listing) = listings.first() {
|
|
||||||
for child in &post_listing.data.children {
|
|
||||||
if child.kind == "t3" {
|
|
||||||
let d = &child.data;
|
|
||||||
title = d.title.clone();
|
|
||||||
author = d.author.clone();
|
|
||||||
subreddit = d.subreddit_name_prefixed.clone();
|
|
||||||
|
|
||||||
if let Some(ref t) = title {
|
|
||||||
markdown.push_str(&format!("# {t}\n\n"));
|
|
||||||
}
|
|
||||||
if let (Some(a), Some(sr)) = (&author, &subreddit) {
|
|
||||||
markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
|
|
||||||
}
|
|
||||||
if let Some(ref body) = d.selftext
|
|
||||||
&& !body.is_empty()
|
|
||||||
{
|
|
||||||
markdown.push_str(body);
|
|
||||||
markdown.push_str("\n\n");
|
|
||||||
}
|
|
||||||
if let Some(ref url_field) = d.url_overridden_by_dest
|
|
||||||
&& !url_field.is_empty()
|
|
||||||
{
|
|
||||||
markdown.push_str(&format!("[Link]({url_field})\n\n"));
|
|
||||||
}
|
|
||||||
markdown.push_str("---\n\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Second listing = comment tree
|
#[test]
|
||||||
if let Some(comment_listing) = listings.get(1) {
|
fn rewrites_bare_to_old() {
|
||||||
markdown.push_str("## Comments\n\n");
|
assert_eq!(
|
||||||
for child in &comment_listing.data.children {
|
to_old_reddit_url("https://reddit.com/r/rust/"),
|
||||||
render_comment(child, 0, &mut markdown);
|
"https://old.reddit.com/r/rust/"
|
||||||
}
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let word_count = markdown.split_whitespace().count();
|
#[test]
|
||||||
debug!(word_count, "reddit json extracted");
|
fn preserves_old_reddit_unchanged() {
|
||||||
|
let url = "https://old.reddit.com/r/rust/comments/abc/x/?context=3";
|
||||||
Ok(ExtractionResult {
|
assert_eq!(to_old_reddit_url(url), url);
|
||||||
metadata: Metadata {
|
|
||||||
title,
|
|
||||||
description: None,
|
|
||||||
author,
|
|
||||||
published_date: None,
|
|
||||||
language: Some("en".into()),
|
|
||||||
url: Some(url.to_string()),
|
|
||||||
site_name: subreddit,
|
|
||||||
image: None,
|
|
||||||
favicon: None,
|
|
||||||
word_count,
|
|
||||||
},
|
|
||||||
content: Content {
|
|
||||||
markdown,
|
|
||||||
plain_text: String::new(),
|
|
||||||
links: vec![],
|
|
||||||
images: vec![],
|
|
||||||
code_blocks: vec![],
|
|
||||||
raw_html: None,
|
|
||||||
},
|
|
||||||
domain_data: None,
|
|
||||||
structured_data: vec![],
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
|
|
||||||
if thing.kind != "t1" {
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
let d = &thing.data;
|
|
||||||
let indent = " ".repeat(depth);
|
|
||||||
let author = d.author.as_deref().unwrap_or("[deleted]");
|
|
||||||
let body = d.body.as_deref().unwrap_or("[removed]");
|
|
||||||
let score = d.score.unwrap_or(0);
|
|
||||||
|
|
||||||
out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
|
#[test]
|
||||||
for line in body.lines() {
|
fn preserves_query_and_hash() {
|
||||||
out.push_str(&format!("{indent} {line}\n"));
|
assert_eq!(
|
||||||
}
|
to_old_reddit_url("https://www.reddit.com/r/rust/?sort=top#anchor"),
|
||||||
out.push('\n');
|
"https://old.reddit.com/r/rust/?sort=top#anchor"
|
||||||
|
);
|
||||||
// Recurse into replies
|
|
||||||
if let Some(Replies::Listing(listing)) = &d.replies {
|
|
||||||
for child in &listing.data.children {
|
|
||||||
render_comment(child, depth + 1, out);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Reddit JSON types (minimal) ---
|
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct Listing {
|
|
||||||
data: ListingData,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct ListingData {
|
|
||||||
children: Vec<Thing>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct Thing {
|
|
||||||
kind: String,
|
|
||||||
data: ThingData,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct ThingData {
|
|
||||||
// Post fields (t3)
|
|
||||||
title: Option<String>,
|
|
||||||
selftext: Option<String>,
|
|
||||||
subreddit_name_prefixed: Option<String>,
|
|
||||||
url_overridden_by_dest: Option<String>,
|
|
||||||
// Comment fields (t1)
|
|
||||||
author: Option<String>,
|
|
||||||
body: Option<String>,
|
|
||||||
score: Option<i64>,
|
|
||||||
replies: Option<Replies>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Reddit replies can be either a nested Listing or an empty string.
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
#[serde(untagged)]
|
|
||||||
enum Replies {
|
|
||||||
Listing(Listing),
|
|
||||||
#[allow(dead_code)]
|
|
||||||
Empty(String),
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -5,17 +5,55 @@
|
||||||
//! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order,
|
//! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order,
|
||||||
//! stream dependency, priorities) to match real browser fingerprints.
|
//! stream dependency, priorities) to match real browser fingerprints.
|
||||||
|
|
||||||
use std::time::Duration;
|
use std::{borrow::Cow, io, time::Duration};
|
||||||
|
|
||||||
use wreq::http2::{
|
use wreq::http2::{
|
||||||
Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
|
Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
|
||||||
};
|
};
|
||||||
use wreq::tls::{AlpsProtocol, CertificateCompressionAlgorithm, TlsOptions, TlsVersion};
|
use wreq::tls::{
|
||||||
|
AlpnProtocol, AlpsProtocol, CertificateCompressionAlgorithm, ExtensionType, TlsOptions,
|
||||||
|
TlsVersion,
|
||||||
|
};
|
||||||
use wreq::{Client, Emulation};
|
use wreq::{Client, Emulation};
|
||||||
|
|
||||||
use crate::browser::BrowserVariant;
|
use crate::browser::BrowserVariant;
|
||||||
use crate::error::FetchError;
|
use crate::error::FetchError;
|
||||||
|
|
||||||
|
#[derive(Clone, Default)]
|
||||||
|
struct PublicDnsResolver;
|
||||||
|
|
||||||
|
impl wreq::dns::Resolve for PublicDnsResolver {
|
||||||
|
fn resolve(&self, name: wreq::dns::Name) -> wreq::dns::Resolving {
|
||||||
|
Box::pin(async move {
|
||||||
|
let addrs = tokio::net::lookup_host((name.as_str(), 0))
|
||||||
|
.await
|
||||||
|
.map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>)?;
|
||||||
|
let mut public = Vec::new();
|
||||||
|
|
||||||
|
for addr in addrs {
|
||||||
|
if crate::url_security::is_blocked_ip(addr.ip()) {
|
||||||
|
let err: Box<dyn std::error::Error + Send + Sync> = Box::new(io::Error::new(
|
||||||
|
io::ErrorKind::PermissionDenied,
|
||||||
|
"DNS resolved to a blocked private or internal address",
|
||||||
|
));
|
||||||
|
return Err(err);
|
||||||
|
}
|
||||||
|
public.push(addr);
|
||||||
|
}
|
||||||
|
|
||||||
|
if public.is_empty() {
|
||||||
|
let err: Box<dyn std::error::Error + Send + Sync> = Box::new(io::Error::new(
|
||||||
|
io::ErrorKind::NotFound,
|
||||||
|
"host did not resolve to any addresses",
|
||||||
|
));
|
||||||
|
return Err(err);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Box::new(public.into_iter()) as wreq::dns::Addrs)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Chrome cipher list (TLS 1.3 + TLS 1.2 in Chrome's exact order).
|
/// Chrome cipher list (TLS 1.3 + TLS 1.2 in Chrome's exact order).
|
||||||
const CHROME_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";
|
const CHROME_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";
|
||||||
|
|
||||||
|
|
@ -43,6 +81,55 @@ const SAFARI_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkc
|
||||||
/// Safari curves.
|
/// Safari curves.
|
||||||
const SAFARI_CURVES: &str = "X25519:P-256:P-384:P-521";
|
const SAFARI_CURVES: &str = "X25519:P-256:P-384:P-521";
|
||||||
|
|
||||||
|
/// Safari iOS 26 TLS extension order, matching bogdanfinn's
|
||||||
|
/// `safari_ios_26_0` wire format. GREASE slots are omitted. wreq
|
||||||
|
/// inserts them itself. Diverges from wreq-util's default SafariIos26
|
||||||
|
/// extension order, which DataDome's immobiliare.it ruleset flags.
|
||||||
|
fn safari_ios_extensions() -> Vec<ExtensionType> {
|
||||||
|
vec![
|
||||||
|
ExtensionType::CERTIFICATE_TIMESTAMP,
|
||||||
|
ExtensionType::APPLICATION_LAYER_PROTOCOL_NEGOTIATION,
|
||||||
|
ExtensionType::SERVER_NAME,
|
||||||
|
ExtensionType::CERT_COMPRESSION,
|
||||||
|
ExtensionType::KEY_SHARE,
|
||||||
|
ExtensionType::SUPPORTED_VERSIONS,
|
||||||
|
ExtensionType::PSK_KEY_EXCHANGE_MODES,
|
||||||
|
ExtensionType::SUPPORTED_GROUPS,
|
||||||
|
ExtensionType::RENEGOTIATE,
|
||||||
|
ExtensionType::SIGNATURE_ALGORITHMS,
|
||||||
|
ExtensionType::STATUS_REQUEST,
|
||||||
|
ExtensionType::EC_POINT_FORMATS,
|
||||||
|
ExtensionType::EXTENDED_MASTER_SECRET,
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Chrome 133 TLS extension order, matching bogdanfinn's stable JA3
|
||||||
|
/// (`43067709b025da334de1279a120f8e14`). Real Chrome permutes extensions
|
||||||
|
/// per handshake, but indeed.com's WAF allowlists this specific wire order
|
||||||
|
/// and rejects permuted ones. GREASE slots are inserted by wreq.
|
||||||
|
///
|
||||||
|
/// JA3 extension field from peet.ws: 18-5-35-51-10-45-11-27-17613-43-13-0-16-65037-65281-23
|
||||||
|
fn chrome_extensions() -> Vec<ExtensionType> {
|
||||||
|
vec![
|
||||||
|
ExtensionType::CERTIFICATE_TIMESTAMP, // 18
|
||||||
|
ExtensionType::STATUS_REQUEST, // 5
|
||||||
|
ExtensionType::SESSION_TICKET, // 35
|
||||||
|
ExtensionType::KEY_SHARE, // 51
|
||||||
|
ExtensionType::SUPPORTED_GROUPS, // 10
|
||||||
|
ExtensionType::PSK_KEY_EXCHANGE_MODES, // 45
|
||||||
|
ExtensionType::EC_POINT_FORMATS, // 11
|
||||||
|
ExtensionType::CERT_COMPRESSION, // 27
|
||||||
|
ExtensionType::APPLICATION_SETTINGS_NEW, // 17613 (new codepoint, matches alps_use_new_codepoint)
|
||||||
|
ExtensionType::SUPPORTED_VERSIONS, // 43
|
||||||
|
ExtensionType::SIGNATURE_ALGORITHMS, // 13
|
||||||
|
ExtensionType::SERVER_NAME, // 0
|
||||||
|
ExtensionType::APPLICATION_LAYER_PROTOCOL_NEGOTIATION, // 16
|
||||||
|
ExtensionType::ENCRYPTED_CLIENT_HELLO, // 65037
|
||||||
|
ExtensionType::RENEGOTIATE, // 65281
|
||||||
|
ExtensionType::EXTENDED_MASTER_SECRET, // 23
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
// --- Chrome HTTP headers in correct wire order ---
|
// --- Chrome HTTP headers in correct wire order ---
|
||||||
|
|
||||||
const CHROME_HEADERS: &[(&str, &str)] = &[
|
const CHROME_HEADERS: &[(&str, &str)] = &[
|
||||||
|
|
@ -130,6 +217,26 @@ const SAFARI_HEADERS: &[(&str, &str)] = &[
|
||||||
("sec-fetch-dest", "document"),
|
("sec-fetch-dest", "document"),
|
||||||
];
|
];
|
||||||
|
|
||||||
|
/// Default request headers for the Safari iOS 26 profile, listed in the
/// wire order real Safari emits.
///
/// What is deliberately absent matters as much as what is present:
/// - no `sec-fetch-*` and no `priority: u=0, i` (both Chromium-only leaks);
/// - no zstd in `accept-encoding` (Safari can't decode it).
///
/// `upgrade-insecure-requests: 1` is present. Verified against bogdanfinn
/// on 2026-04-22: this header set is what DataDome's immobiliare ruleset
/// expects for a real iPhone.
const SAFARI_IOS_HEADERS: &[(&str, &str)] = &[
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    ),
    ("accept-language", "en-US,en;q=0.9"),
    ("accept-encoding", "gzip, deflate, br"),
    (
        "user-agent",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 26_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/26.0 Mobile/15E148 Safari/604.1",
    ),
    ("upgrade-insecure-requests", "1"),
];
|
||||||
|
|
||||||
const EDGE_HEADERS: &[(&str, &str)] = &[
|
const EDGE_HEADERS: &[(&str, &str)] = &[
|
||||||
(
|
(
|
||||||
"sec-ch-ua",
|
"sec-ch-ua",
|
||||||
|
|
@ -156,6 +263,9 @@ const EDGE_HEADERS: &[(&str, &str)] = &[
|
||||||
];
|
];
|
||||||
|
|
||||||
fn chrome_tls() -> TlsOptions {
|
fn chrome_tls() -> TlsOptions {
|
||||||
|
// permute_extensions is off so the explicit extension_permutation sticks.
|
||||||
|
// Real Chrome permutes, but indeed.com's WAF allowlists bogdanfinn's
|
||||||
|
// fixed order, so matching that gets us through.
|
||||||
TlsOptions::builder()
|
TlsOptions::builder()
|
||||||
.cipher_list(CHROME_CIPHERS)
|
.cipher_list(CHROME_CIPHERS)
|
||||||
.sigalgs_list(CHROME_SIGALGS)
|
.sigalgs_list(CHROME_SIGALGS)
|
||||||
|
|
@ -163,12 +273,18 @@ fn chrome_tls() -> TlsOptions {
|
||||||
.min_tls_version(TlsVersion::TLS_1_2)
|
.min_tls_version(TlsVersion::TLS_1_2)
|
||||||
.max_tls_version(TlsVersion::TLS_1_3)
|
.max_tls_version(TlsVersion::TLS_1_3)
|
||||||
.grease_enabled(true)
|
.grease_enabled(true)
|
||||||
.permute_extensions(true)
|
.permute_extensions(false)
|
||||||
|
.extension_permutation(chrome_extensions())
|
||||||
.enable_ech_grease(true)
|
.enable_ech_grease(true)
|
||||||
.pre_shared_key(true)
|
.pre_shared_key(true)
|
||||||
.enable_ocsp_stapling(true)
|
.enable_ocsp_stapling(true)
|
||||||
.enable_signed_cert_timestamps(true)
|
.enable_signed_cert_timestamps(true)
|
||||||
.alps_protocols([AlpsProtocol::HTTP2])
|
.alpn_protocols([
|
||||||
|
AlpnProtocol::HTTP3,
|
||||||
|
AlpnProtocol::HTTP2,
|
||||||
|
AlpnProtocol::HTTP1,
|
||||||
|
])
|
||||||
|
.alps_protocols([AlpsProtocol::HTTP3, AlpsProtocol::HTTP2])
|
||||||
.alps_use_new_codepoint(true)
|
.alps_use_new_codepoint(true)
|
||||||
.aes_hw_override(true)
|
.aes_hw_override(true)
|
||||||
.certificate_compression_algorithms(&[CertificateCompressionAlgorithm::BROTLI])
|
.certificate_compression_algorithms(&[CertificateCompressionAlgorithm::BROTLI])
|
||||||
|
|
@ -212,25 +328,70 @@ fn safari_tls() -> TlsOptions {
|
||||||
.build()
|
.build()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Safari iOS 26 emulation — composed on top of `wreq_util::Emulation::SafariIos26`
|
||||||
|
/// with four targeted overrides. We don't hand-roll this one like Chrome/Firefox
|
||||||
|
/// because the wire-level defaults from wreq-util are already correct for ciphers,
|
||||||
|
/// sigalgs, curves, and GREASE — the four things wreq-util gets *wrong* for
|
||||||
|
/// DataDome compatibility are overridden here:
|
||||||
|
///
|
||||||
|
/// 1. TLS extension order: match bogdanfinn `safari_ios_26_0` exactly (JA3
|
||||||
|
/// ends up `8d909525bd5bbb79f133d11cc05159fe`).
|
||||||
|
/// 2. HTTP/2 HEADERS priority flag: weight=256, exclusive=1, depends_on=0.
|
||||||
|
/// wreq-util omits this frame; real Safari and bogdanfinn include it.
|
||||||
|
/// This flip is the thing DataDome actually reads — the akamai_fingerprint
|
||||||
|
/// hash changes from `c52879e43202aeb92740be6e8c86ea96` to
|
||||||
|
/// `d1294410a06522e37a5c5e3f0a45a705`, which is the winning signature.
|
||||||
|
/// 3. Headers: strip wreq-util's Chromium defaults (`sec-fetch-*`,
|
||||||
|
/// `priority: u=0, i`, zstd), replace with the real iOS 26 set.
|
||||||
|
/// 4. `accept-language` preserved from config.extra_headers for locale.
|
||||||
|
fn safari_ios_emulation() -> wreq::Emulation {
|
||||||
|
use wreq::EmulationFactory;
|
||||||
|
let mut em = wreq_util::Emulation::SafariIos26.emulation();
|
||||||
|
|
||||||
|
if let Some(tls) = em.tls_options_mut().as_mut() {
|
||||||
|
tls.extension_permutation = Some(Cow::Owned(safari_ios_extensions()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only override the priority flag — keep wreq-util's SETTINGS, WINDOW_UPDATE,
|
||||||
|
// and pseudo-order intact. Replacing the whole Http2Options resets SETTINGS
|
||||||
|
// to defaults, which sends only INITIAL_WINDOW_SIZE and fails DataDome.
|
||||||
|
if let Some(h2) = em.http2_options_mut().as_mut() {
|
||||||
|
h2.headers_stream_dependency = Some(StreamDependency::new(StreamId::zero(), 255, true));
|
||||||
|
}
|
||||||
|
|
||||||
|
let hm = em.headers_mut();
|
||||||
|
hm.clear();
|
||||||
|
for (k, v) in SAFARI_IOS_HEADERS {
|
||||||
|
if let (Ok(n), Ok(val)) = (
|
||||||
|
http::header::HeaderName::from_bytes(k.as_bytes()),
|
||||||
|
http::header::HeaderValue::from_str(v),
|
||||||
|
) {
|
||||||
|
hm.append(n, val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
em
|
||||||
|
}
|
||||||
|
|
||||||
fn chrome_h2() -> Http2Options {
|
fn chrome_h2() -> Http2Options {
|
||||||
|
// SETTINGS frame matches bogdanfinn `chrome_133`: HEADER_TABLE_SIZE,
|
||||||
|
// ENABLE_PUSH=0, INITIAL_WINDOW_SIZE, MAX_HEADER_LIST_SIZE. No
|
||||||
|
// MAX_CONCURRENT_STREAMS — real Chrome 133 and bogdanfinn both omit it,
|
||||||
|
// and indeed.com's WAF reads this as a bot signal when present. Priority
|
||||||
|
// weight 256 (encoded as 255 + 1) matches bogdanfinn's HEADERS frame.
|
||||||
Http2Options::builder()
|
Http2Options::builder()
|
||||||
.initial_window_size(6_291_456)
|
.initial_window_size(6_291_456)
|
||||||
.initial_connection_window_size(15_728_640)
|
.initial_connection_window_size(15_728_640)
|
||||||
.max_header_list_size(262_144)
|
.max_header_list_size(262_144)
|
||||||
.header_table_size(65_536)
|
.header_table_size(65_536)
|
||||||
.max_concurrent_streams(1000u32)
|
|
||||||
.enable_push(false)
|
.enable_push(false)
|
||||||
.settings_order(
|
.settings_order(
|
||||||
SettingsOrder::builder()
|
SettingsOrder::builder()
|
||||||
.extend([
|
.extend([
|
||||||
SettingId::HeaderTableSize,
|
SettingId::HeaderTableSize,
|
||||||
SettingId::EnablePush,
|
SettingId::EnablePush,
|
||||||
SettingId::MaxConcurrentStreams,
|
|
||||||
SettingId::InitialWindowSize,
|
SettingId::InitialWindowSize,
|
||||||
SettingId::MaxFrameSize,
|
|
||||||
SettingId::MaxHeaderListSize,
|
SettingId::MaxHeaderListSize,
|
||||||
SettingId::EnableConnectProtocol,
|
|
||||||
SettingId::NoRfc7540Priorities,
|
|
||||||
])
|
])
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
@ -244,7 +405,7 @@ fn chrome_h2() -> Http2Options {
|
||||||
])
|
])
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
.headers_stream_dependency(StreamDependency::new(StreamId::zero(), 219, true))
|
.headers_stream_dependency(StreamDependency::new(StreamId::zero(), 255, true))
|
||||||
.build()
|
.build()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -327,46 +488,139 @@ pub fn build_client(
|
||||||
timeout: Duration,
|
timeout: Duration,
|
||||||
extra_headers: &std::collections::HashMap<String, String>,
|
extra_headers: &std::collections::HashMap<String, String>,
|
||||||
proxy: Option<&str>,
|
proxy: Option<&str>,
|
||||||
|
follow_redirects: bool,
|
||||||
|
max_redirects: u32,
|
||||||
) -> Result<Client, FetchError> {
|
) -> Result<Client, FetchError> {
|
||||||
let (tls, h2, headers) = match variant {
|
// SafariIos26 builds its Emulation on top of wreq-util's base instead
|
||||||
BrowserVariant::Chrome => (chrome_tls(), chrome_h2(), CHROME_HEADERS),
|
// of from scratch. See `safari_ios_emulation` for why.
|
||||||
BrowserVariant::ChromeMacos => (chrome_tls(), chrome_h2(), CHROME_MACOS_HEADERS),
|
let mut emulation = match variant {
|
||||||
BrowserVariant::Firefox => (firefox_tls(), firefox_h2(), FIREFOX_HEADERS),
|
BrowserVariant::SafariIos26 => safari_ios_emulation(),
|
||||||
BrowserVariant::Safari => (safari_tls(), safari_h2(), SAFARI_HEADERS),
|
other => {
|
||||||
BrowserVariant::Edge => (chrome_tls(), chrome_h2(), EDGE_HEADERS),
|
let (tls, h2, headers) = match other {
|
||||||
|
BrowserVariant::Chrome => (chrome_tls(), chrome_h2(), CHROME_HEADERS),
|
||||||
|
BrowserVariant::ChromeMacos => (chrome_tls(), chrome_h2(), CHROME_MACOS_HEADERS),
|
||||||
|
BrowserVariant::Firefox => (firefox_tls(), firefox_h2(), FIREFOX_HEADERS),
|
||||||
|
BrowserVariant::Safari => (safari_tls(), safari_h2(), SAFARI_HEADERS),
|
||||||
|
BrowserVariant::Edge => (chrome_tls(), chrome_h2(), EDGE_HEADERS),
|
||||||
|
BrowserVariant::SafariIos26 => unreachable!("handled above"),
|
||||||
|
};
|
||||||
|
Emulation::builder()
|
||||||
|
.tls_options(tls)
|
||||||
|
.http2_options(h2)
|
||||||
|
.headers(build_headers(headers))
|
||||||
|
.build()
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut header_map = build_headers(headers);
|
// Append extra headers after profile defaults.
|
||||||
|
let hm = emulation.headers_mut();
|
||||||
// Append extra headers after profile defaults
|
|
||||||
for (k, v) in extra_headers {
|
for (k, v) in extra_headers {
|
||||||
if let (Ok(n), Ok(val)) = (
|
if let (Ok(n), Ok(val)) = (
|
||||||
http::header::HeaderName::from_bytes(k.as_bytes()),
|
http::header::HeaderName::from_bytes(k.as_bytes()),
|
||||||
http::header::HeaderValue::from_str(v),
|
http::header::HeaderValue::from_str(v),
|
||||||
) {
|
) {
|
||||||
header_map.insert(n, val);
|
hm.insert(n, val);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let emulation = Emulation::builder()
|
|
||||||
.tls_options(tls)
|
|
||||||
.http2_options(h2)
|
|
||||||
.headers(header_map)
|
|
||||||
.build();
|
|
||||||
|
|
||||||
let mut builder = Client::builder()
|
let mut builder = Client::builder()
|
||||||
.emulation(emulation)
|
.emulation(emulation)
|
||||||
.redirect(wreq::redirect::Policy::limited(10))
|
.redirect(ssrf_safe_redirect_policy(
|
||||||
|
follow_redirects,
|
||||||
|
max_redirects as usize,
|
||||||
|
))
|
||||||
.cookie_store(true)
|
.cookie_store(true)
|
||||||
.timeout(timeout);
|
.timeout(timeout);
|
||||||
|
|
||||||
if let Some(proxy_url) = proxy {
|
if let Some(proxy_url) = proxy {
|
||||||
let proxy =
|
let proxy = wreq::Proxy::all(proxy_url).map_err(|_| {
|
||||||
wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
|
FetchError::Build(format!("invalid proxy {}", redact_proxy_url(proxy_url)))
|
||||||
|
})?;
|
||||||
builder = builder.proxy(proxy);
|
builder = builder.proxy(proxy);
|
||||||
|
} else {
|
||||||
|
builder = builder.dns_resolver(PublicDnsResolver);
|
||||||
}
|
}
|
||||||
|
|
||||||
builder
|
builder
|
||||||
.build()
|
.build()
|
||||||
.map_err(|e| FetchError::Build(e.to_string()))
|
.map_err(|e| FetchError::Build(e.to_string()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Render a proxy URL safe to log: drop any `user:pass@` userinfo so
|
||||||
|
/// rotating-proxy credentials never reach error strings or tracing.
|
||||||
|
/// Falls back to a constant placeholder when the input does not parse.
|
||||||
|
fn redact_proxy_url(raw: &str) -> String {
|
||||||
|
match url::Url::parse(raw) {
|
||||||
|
Ok(mut u) => {
|
||||||
|
// Best-effort: opaque URLs (e.g. no host) reject these setters;
|
||||||
|
// in that case fall through to the placeholder rather than risk
|
||||||
|
// returning the raw string with credentials.
|
||||||
|
if u.set_username("").is_err() || u.set_password(None).is_err() {
|
||||||
|
return "<proxy redacted>".to_string();
|
||||||
|
}
|
||||||
|
u.to_string()
|
||||||
|
}
|
||||||
|
Err(_) => "<proxy redacted>".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn ssrf_safe_redirect_policy(
|
||||||
|
follow_redirects: bool,
|
||||||
|
max_redirects: usize,
|
||||||
|
) -> wreq::redirect::Policy {
|
||||||
|
if !follow_redirects {
|
||||||
|
return wreq::redirect::Policy::none();
|
||||||
|
}
|
||||||
|
|
||||||
|
wreq::redirect::Policy::custom(move |attempt| {
|
||||||
|
if attempt.previous.len() > max_redirects {
|
||||||
|
return attempt.error("too many redirects");
|
||||||
|
}
|
||||||
|
|
||||||
|
attempt.pending(|attempt| async move {
|
||||||
|
let next_url = attempt.uri.to_string();
|
||||||
|
match crate::url_security::validate_public_http_url(&next_url).await {
|
||||||
|
Ok(_) => attempt.follow(),
|
||||||
|
Err(e) => attempt.error(e.to_string()),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::redact_proxy_url;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn redacts_userinfo_from_proxy_url() {
|
||||||
|
let red = redact_proxy_url("http://user123:s3cr3tPass@proxy.example.com:8080");
|
||||||
|
assert!(!red.contains("user123"), "username leaked: {red}");
|
||||||
|
assert!(!red.contains("s3cr3tPass"), "password leaked: {red}");
|
||||||
|
assert!(red.contains("proxy.example.com"), "host lost: {red}");
|
||||||
|
assert!(red.contains("8080"), "port lost: {red}");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn redacts_long_token_residential_proxy() {
|
||||||
|
// Residential-style: long structured credential with embedded
|
||||||
|
// tokens in the username and special chars in the password.
|
||||||
|
let red =
|
||||||
|
redact_proxy_url("http://acct-zone-resi-country-xx:p@ss-word@gw.proxy.example:12321");
|
||||||
|
assert!(!red.contains("acct-zone-resi"), "username leaked: {red}");
|
||||||
|
assert!(!red.contains("p@ss-word"), "password leaked: {red}");
|
||||||
|
assert!(red.contains("gw.proxy.example"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn unparseable_proxy_does_not_echo_input() {
|
||||||
|
let red = redact_proxy_url("user:pass@not a url");
|
||||||
|
assert_eq!(red, "<proxy redacted>");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn proxy_without_credentials_is_preserved() {
|
||||||
|
let red = redact_proxy_url("http://proxy.example.com:3128");
|
||||||
|
assert!(red.contains("proxy.example.com"));
|
||||||
|
assert!(red.contains("3128"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
203
crates/webclaw-fetch/src/url_security.rs
Normal file
203
crates/webclaw-fetch/src/url_security.rs
Normal file
|
|
@ -0,0 +1,203 @@
|
||||||
|
//! SSRF guard for every server-side fetch.
|
||||||
|
//!
|
||||||
|
//! Callers may still do cheap parse validation at the edge, but this
|
||||||
|
//! module is the fetch-layer authority because redirects and helper
|
||||||
|
//! fetches also pass through it.
|
||||||
|
|
||||||
|
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
|
||||||
|
|
||||||
|
use tokio::net::lookup_host;
|
||||||
|
use url::{Host, Url};
|
||||||
|
|
||||||
|
use crate::error::FetchError;
|
||||||
|
|
||||||
|
/// Parse a caller-provided URL and require an HTTP(S) host.
|
||||||
|
pub fn validate_http_url(raw: &str) -> Result<Url, FetchError> {
|
||||||
|
let trimmed = raw.trim();
|
||||||
|
if trimmed.is_empty() {
|
||||||
|
return Err(FetchError::InvalidUrl("URL must not be empty".into()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let parsed =
|
||||||
|
Url::parse(trimmed).map_err(|e| FetchError::InvalidUrl(format!("invalid URL: {e}")))?;
|
||||||
|
match parsed.scheme() {
|
||||||
|
"http" | "https" => {}
|
||||||
|
scheme => {
|
||||||
|
return Err(FetchError::InvalidUrl(format!(
|
||||||
|
"scheme '{scheme}' is not allowed, use http:// or https://"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if parsed.host().is_none() {
|
||||||
|
return Err(FetchError::InvalidUrl("URL must include a host".into()));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(parsed)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse, resolve, and reject private/internal destinations.
|
||||||
|
///
|
||||||
|
/// A domain is rejected if any resolved address is private or reserved.
|
||||||
|
/// That is intentionally conservative: mixed public/private DNS answers
|
||||||
|
/// are unsafe for server-side fetching.
|
||||||
|
pub async fn validate_public_http_url(raw: &str) -> Result<Url, FetchError> {
|
||||||
|
let parsed = validate_http_url(raw)?;
|
||||||
|
validate_url_host_is_public(&parsed).await?;
|
||||||
|
Ok(parsed)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn validate_url_host_is_public(url: &Url) -> Result<(), FetchError> {
|
||||||
|
match url.host() {
|
||||||
|
Some(Host::Ipv4(ip)) => reject_blocked_ip(IpAddr::V4(ip)),
|
||||||
|
Some(Host::Ipv6(ip)) => reject_blocked_ip(IpAddr::V6(ip)),
|
||||||
|
Some(Host::Domain(host)) => {
|
||||||
|
let port = url
|
||||||
|
.port_or_known_default()
|
||||||
|
.ok_or_else(|| FetchError::InvalidUrl("URL must include a known port".into()))?;
|
||||||
|
let addrs = lookup_host((host, port))
|
||||||
|
.await
|
||||||
|
.map_err(|e| FetchError::InvalidUrl(format!("failed to resolve host: {e}")))?;
|
||||||
|
|
||||||
|
let mut resolved = false;
|
||||||
|
for addr in addrs {
|
||||||
|
resolved = true;
|
||||||
|
reject_blocked_ip(addr.ip())?;
|
||||||
|
}
|
||||||
|
if !resolved {
|
||||||
|
return Err(FetchError::InvalidUrl(
|
||||||
|
"host did not resolve to any addresses".into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
None => Err(FetchError::InvalidUrl("URL must include a host".into())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn reject_blocked_ip(ip: IpAddr) -> Result<(), FetchError> {
|
||||||
|
if is_blocked_ip(ip) {
|
||||||
|
Err(FetchError::InvalidUrl(
|
||||||
|
"URL resolves to a blocked private or internal address".into(),
|
||||||
|
))
|
||||||
|
} else {
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return true for IP ranges that should never be fetched server-side.
|
||||||
|
pub fn is_blocked_ip(ip: IpAddr) -> bool {
|
||||||
|
match ip {
|
||||||
|
IpAddr::V4(ip) => is_blocked_ipv4(ip),
|
||||||
|
IpAddr::V6(ip) => is_blocked_ipv6(ip),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_blocked_ipv4(ip: Ipv4Addr) -> bool {
|
||||||
|
let o = ip.octets();
|
||||||
|
|
||||||
|
ip.is_unspecified()
|
||||||
|
|| ip.is_loopback()
|
||||||
|
|| ip.is_private()
|
||||||
|
|| ip.is_link_local()
|
||||||
|
|| o[0] == 0
|
||||||
|
|| o[0] >= 224
|
||||||
|
|| (o[0] == 100 && (64..=127).contains(&o[1]))
|
||||||
|
|| (o[0] == 192 && o[1] == 0 && o[2] == 0)
|
||||||
|
|| (o[0] == 192 && o[1] == 0 && o[2] == 2)
|
||||||
|
|| (o[0] == 198 && (18..=19).contains(&o[1]))
|
||||||
|
|| (o[0] == 198 && o[1] == 51 && o[2] == 100)
|
||||||
|
|| (o[0] == 203 && o[1] == 0 && o[2] == 113)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_blocked_ipv6(ip: Ipv6Addr) -> bool {
|
||||||
|
let s = ip.segments();
|
||||||
|
|
||||||
|
ip.is_unspecified()
|
||||||
|
|| ip.is_loopback()
|
||||||
|
|| ip.is_multicast()
|
||||||
|
|| (s[0] & 0xfe00) == 0xfc00
|
||||||
|
|| (s[0] & 0xffc0) == 0xfe80
|
||||||
|
|| (s[0] == 0x0064 && s[1] == 0xff9b && s[2] == 0 && s[3] == 0 && s[4] == 0 && s[5] == 0)
|
||||||
|
|| (s[0] == 0x2001 && s[1] == 0x0db8)
|
||||||
|
|| embedded_ipv4(ip).is_some_and(is_blocked_ipv4)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn embedded_ipv4(ip: Ipv6Addr) -> Option<Ipv4Addr> {
|
||||||
|
let s = ip.segments();
|
||||||
|
|
||||||
|
if s[0] == 0 && s[1] == 0 && s[2] == 0 && s[3] == 0 && s[4] == 0 && s[5] == 0xffff {
|
||||||
|
return Some(Ipv4Addr::new(
|
||||||
|
(s[6] >> 8) as u8,
|
||||||
|
s[6] as u8,
|
||||||
|
(s[7] >> 8) as u8,
|
||||||
|
s[7] as u8,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
if s[0] == 0 && s[1] == 0 && s[2] == 0 && s[3] == 0 && s[4] == 0 && s[5] == 0 {
|
||||||
|
return Some(Ipv4Addr::new(
|
||||||
|
(s[6] >> 8) as u8,
|
||||||
|
s[6] as u8,
|
||||||
|
(s[7] >> 8) as u8,
|
||||||
|
s[7] as u8,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
|
||||||
|
|
||||||
|
use super::{is_blocked_ip, validate_public_http_url};
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn blocks_ipv4_internal_ranges() {
|
||||||
|
for ip in [
|
||||||
|
Ipv4Addr::new(0, 0, 0, 0),
|
||||||
|
Ipv4Addr::new(10, 0, 0, 1),
|
||||||
|
Ipv4Addr::new(100, 64, 0, 1),
|
||||||
|
Ipv4Addr::new(127, 0, 0, 1),
|
||||||
|
Ipv4Addr::new(169, 254, 169, 254),
|
||||||
|
Ipv4Addr::new(172, 16, 0, 1),
|
||||||
|
Ipv4Addr::new(192, 168, 0, 1),
|
||||||
|
Ipv4Addr::new(192, 0, 0, 8),
|
||||||
|
Ipv4Addr::new(198, 18, 0, 1),
|
||||||
|
Ipv4Addr::new(255, 255, 255, 255),
|
||||||
|
] {
|
||||||
|
let url = format!("http://{ip}/");
|
||||||
|
assert!(validate_public_http_url(&url).await.is_err(), "{ip}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn blocks_ipv6_internal_ranges() {
|
||||||
|
for ip in [
|
||||||
|
Ipv6Addr::LOCALHOST,
|
||||||
|
Ipv6Addr::UNSPECIFIED,
|
||||||
|
"fc00::1".parse().unwrap(),
|
||||||
|
"fe80::1".parse().unwrap(),
|
||||||
|
"64:ff9b::7f00:1".parse().unwrap(),
|
||||||
|
"::ffff:127.0.0.1".parse().unwrap(),
|
||||||
|
] {
|
||||||
|
assert!(is_blocked_ip(IpAddr::V6(ip)), "{ip}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn allows_public_ip_literals() {
|
||||||
|
assert!(
|
||||||
|
validate_public_http_url("https://93.184.216.34/")
|
||||||
|
.await
|
||||||
|
.is_ok()
|
||||||
|
);
|
||||||
|
assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn blocks_localhost_domains_after_resolution() {
|
||||||
|
assert!(validate_public_http_url("http://localhost/").await.is_err());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -34,7 +34,7 @@ impl ProviderChain {
|
||||||
providers.push(Box::new(openai));
|
providers.push(Box::new(openai));
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(anthropic) = AnthropicProvider::new(None, None) {
|
if let Some(anthropic) = AnthropicProvider::with_base_url(None, None, None) {
|
||||||
debug!("anthropic configured, adding to chain");
|
debug!("anthropic configured, adding to chain");
|
||||||
providers.push(Box::new(anthropic));
|
providers.push(Box::new(anthropic));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -10,23 +10,38 @@ use crate::provider::{CompletionRequest, LlmProvider};
|
||||||
|
|
||||||
use super::load_api_key;
|
use super::load_api_key;
|
||||||
|
|
||||||
const ANTHROPIC_API_URL: &str = "https://api.anthropic.com/v1/messages";
|
const DEFAULT_ANTHROPIC_BASE_URL: &str = "https://api.anthropic.com/v1";
|
||||||
const ANTHROPIC_VERSION: &str = "2023-06-01";
|
const ANTHROPIC_VERSION: &str = "2023-06-01";
|
||||||
|
|
||||||
pub struct AnthropicProvider {
|
pub struct AnthropicProvider {
|
||||||
client: reqwest::Client,
|
client: reqwest::Client,
|
||||||
key: String,
|
key: String,
|
||||||
|
base_url: String,
|
||||||
default_model: String,
|
default_model: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AnthropicProvider {
|
impl AnthropicProvider {
|
||||||
/// Returns `None` if no API key is available (param or env).
|
/// Returns `None` if no API key is available (param or env).
|
||||||
pub fn new(key_override: Option<String>, model: Option<String>) -> Option<Self> {
|
pub fn new(key_override: Option<String>, model: Option<String>) -> Option<Self> {
|
||||||
|
Self::with_base_url(key_override, None, model)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `None` if no API key is available (param or env).
|
||||||
|
pub fn with_base_url(
|
||||||
|
key_override: Option<String>,
|
||||||
|
base_url: Option<String>,
|
||||||
|
model: Option<String>,
|
||||||
|
) -> Option<Self> {
|
||||||
let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?;
|
let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?;
|
||||||
|
|
||||||
Some(Self {
|
Some(Self {
|
||||||
client: reqwest::Client::new(),
|
client: reqwest::Client::new(),
|
||||||
key,
|
key,
|
||||||
|
base_url: base_url
|
||||||
|
.or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok())
|
||||||
|
.unwrap_or_else(|| DEFAULT_ANTHROPIC_BASE_URL.into())
|
||||||
|
.trim_end_matches('/')
|
||||||
|
.to_string(),
|
||||||
default_model: model.unwrap_or_else(|| "claude-sonnet-4-20250514".into()),
|
default_model: model.unwrap_or_else(|| "claude-sonnet-4-20250514".into()),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
@ -34,6 +49,14 @@ impl AnthropicProvider {
|
||||||
pub fn default_model(&self) -> &str {
|
pub fn default_model(&self) -> &str {
|
||||||
&self.default_model
|
&self.default_model
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn messages_url(&self) -> String {
|
||||||
|
if self.base_url.ends_with("/messages") {
|
||||||
|
self.base_url.clone()
|
||||||
|
} else {
|
||||||
|
format!("{}/messages", self.base_url)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
|
|
@ -74,7 +97,7 @@ impl LlmProvider for AnthropicProvider {
|
||||||
|
|
||||||
let resp = self
|
let resp = self
|
||||||
.client
|
.client
|
||||||
.post(ANTHROPIC_API_URL)
|
.post(self.messages_url())
|
||||||
.header("x-api-key", &self.key)
|
.header("x-api-key", &self.key)
|
||||||
.header("anthropic-version", ANTHROPIC_VERSION)
|
.header("anthropic-version", ANTHROPIC_VERSION)
|
||||||
.header("content-type", "application/json")
|
.header("content-type", "application/json")
|
||||||
|
|
@ -135,6 +158,11 @@ mod tests {
|
||||||
assert_eq!(provider.name(), "anthropic");
|
assert_eq!(provider.name(), "anthropic");
|
||||||
assert_eq!(provider.default_model, "claude-sonnet-4-20250514");
|
assert_eq!(provider.default_model, "claude-sonnet-4-20250514");
|
||||||
assert_eq!(provider.key, "sk-ant-test");
|
assert_eq!(provider.key, "sk-ant-test");
|
||||||
|
assert_eq!(provider.base_url, "https://api.anthropic.com/v1");
|
||||||
|
assert_eq!(
|
||||||
|
provider.messages_url(),
|
||||||
|
"https://api.anthropic.com/v1/messages"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
@ -151,6 +179,35 @@ mod tests {
|
||||||
assert_eq!(provider.default_model(), "claude-sonnet-4-20250514");
|
assert_eq!(provider.default_model(), "claude-sonnet-4-20250514");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn custom_base_url_appends_messages_path() {
|
||||||
|
let provider = AnthropicProvider::with_base_url(
|
||||||
|
Some("sk-ant-test".into()),
|
||||||
|
Some("https://proxy.example.test/anthropic/v1/".into()),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(provider.base_url, "https://proxy.example.test/anthropic/v1");
|
||||||
|
assert_eq!(
|
||||||
|
provider.messages_url(),
|
||||||
|
"https://proxy.example.test/anthropic/v1/messages"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn custom_full_messages_url_is_not_doubled() {
|
||||||
|
let provider = AnthropicProvider::with_base_url(
|
||||||
|
Some("sk-ant-test".into()),
|
||||||
|
Some("https://proxy.example.test/v1/messages".into()),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
provider.messages_url(),
|
||||||
|
"https://proxy.example.test/v1/messages"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Env var fallback tests mutate process-global state and race with parallel tests.
|
// Env var fallback tests mutate process-global state and race with parallel tests.
|
||||||
// The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
|
// The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
|
||||||
// cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
|
// cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,50 @@ pub struct OpenAiProvider {
|
||||||
key: String,
|
key: String,
|
||||||
base_url: String,
|
base_url: String,
|
||||||
default_model: String,
|
default_model: String,
|
||||||
|
response_format: OpenAiResponseFormat,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
|
enum OpenAiResponseFormat {
|
||||||
|
JsonObject,
|
||||||
|
JsonSchema,
|
||||||
|
Text,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl OpenAiResponseFormat {
|
||||||
|
fn from_env() -> Self {
|
||||||
|
std::env::var("OPENAI_RESPONSE_FORMAT_TYPE")
|
||||||
|
.ok()
|
||||||
|
.and_then(|value| Self::parse(&value))
|
||||||
|
.unwrap_or(Self::JsonObject)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse(value: &str) -> Option<Self> {
|
||||||
|
match value.trim().to_ascii_lowercase().as_str() {
|
||||||
|
"" | "json_object" => Some(Self::JsonObject),
|
||||||
|
"json_schema" => Some(Self::JsonSchema),
|
||||||
|
"text" => Some(Self::Text),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn as_response_format(self) -> serde_json::Value {
|
||||||
|
match self {
|
||||||
|
Self::JsonObject => json!({ "type": "json_object" }),
|
||||||
|
Self::JsonSchema => json!({
|
||||||
|
"type": "json_schema",
|
||||||
|
"json_schema": {
|
||||||
|
"name": "webclaw_response",
|
||||||
|
"schema": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": true
|
||||||
|
},
|
||||||
|
"strict": false
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
Self::Text => json!({ "type": "text" }),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl OpenAiProvider {
|
impl OpenAiProvider {
|
||||||
|
|
@ -31,23 +75,15 @@ impl OpenAiProvider {
|
||||||
.or_else(|| std::env::var("OPENAI_BASE_URL").ok())
|
.or_else(|| std::env::var("OPENAI_BASE_URL").ok())
|
||||||
.unwrap_or_else(|| "https://api.openai.com/v1".into()),
|
.unwrap_or_else(|| "https://api.openai.com/v1".into()),
|
||||||
default_model: model.unwrap_or_else(|| "gpt-4o-mini".into()),
|
default_model: model.unwrap_or_else(|| "gpt-4o-mini".into()),
|
||||||
|
response_format: OpenAiResponseFormat::from_env(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn default_model(&self) -> &str {
|
pub fn default_model(&self) -> &str {
|
||||||
&self.default_model
|
&self.default_model
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait]
|
|
||||||
impl LlmProvider for OpenAiProvider {
|
|
||||||
async fn complete(&self, request: &CompletionRequest) -> Result<String, LlmError> {
|
|
||||||
let model = if request.model.is_empty() {
|
|
||||||
&self.default_model
|
|
||||||
} else {
|
|
||||||
&request.model
|
|
||||||
};
|
|
||||||
|
|
||||||
|
fn request_body(&self, request: &CompletionRequest, model: &str) -> serde_json::Value {
|
||||||
let messages: Vec<serde_json::Value> = request
|
let messages: Vec<serde_json::Value> = request
|
||||||
.messages
|
.messages
|
||||||
.iter()
|
.iter()
|
||||||
|
|
@ -60,7 +96,7 @@ impl LlmProvider for OpenAiProvider {
|
||||||
});
|
});
|
||||||
|
|
||||||
if request.json_mode {
|
if request.json_mode {
|
||||||
body["response_format"] = json!({ "type": "json_object" });
|
body["response_format"] = self.response_format.as_response_format();
|
||||||
}
|
}
|
||||||
if let Some(temp) = request.temperature {
|
if let Some(temp) = request.temperature {
|
||||||
body["temperature"] = json!(temp);
|
body["temperature"] = json!(temp);
|
||||||
|
|
@ -69,6 +105,21 @@ impl LlmProvider for OpenAiProvider {
|
||||||
body["max_tokens"] = json!(max);
|
body["max_tokens"] = json!(max);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
body
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl LlmProvider for OpenAiProvider {
|
||||||
|
async fn complete(&self, request: &CompletionRequest) -> Result<String, LlmError> {
|
||||||
|
let model = if request.model.is_empty() {
|
||||||
|
&self.default_model
|
||||||
|
} else {
|
||||||
|
&request.model
|
||||||
|
};
|
||||||
|
|
||||||
|
let body = self.request_body(request, model);
|
||||||
|
|
||||||
let url = format!("{}/chat/completions", self.base_url);
|
let url = format!("{}/chat/completions", self.base_url);
|
||||||
let resp = self
|
let resp = self
|
||||||
.client
|
.client
|
||||||
|
|
@ -136,6 +187,7 @@ mod tests {
|
||||||
assert_eq!(provider.default_model, "gpt-4o-mini");
|
assert_eq!(provider.default_model, "gpt-4o-mini");
|
||||||
assert_eq!(provider.base_url, "https://api.openai.com/v1");
|
assert_eq!(provider.base_url, "https://api.openai.com/v1");
|
||||||
assert_eq!(provider.key, "test-key-123");
|
assert_eq!(provider.key, "test-key-123");
|
||||||
|
assert_eq!(provider.response_format, OpenAiResponseFormat::JsonObject);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
@ -161,6 +213,69 @@ mod tests {
|
||||||
assert_eq!(provider.default_model(), "gpt-4o-mini");
|
assert_eq!(provider.default_model(), "gpt-4o-mini");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn json_mode_defaults_to_openai_json_object() {
|
||||||
|
let provider = OpenAiProvider::new(
|
||||||
|
Some("test-key".into()),
|
||||||
|
Some("https://api.openai.com/v1".into()),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
let req = CompletionRequest {
|
||||||
|
model: String::new(),
|
||||||
|
messages: vec![],
|
||||||
|
temperature: None,
|
||||||
|
max_tokens: None,
|
||||||
|
json_mode: true,
|
||||||
|
};
|
||||||
|
let body = provider.request_body(&req, provider.default_model());
|
||||||
|
assert_eq!(body["response_format"], json!({ "type": "json_object" }));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn json_schema_response_format_for_compatible_backends() {
|
||||||
|
let mut provider = OpenAiProvider::new(
|
||||||
|
Some("test-key".into()),
|
||||||
|
Some("http://localhost:1234/v1".into()),
|
||||||
|
Some("local-model".into()),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
provider.response_format = OpenAiResponseFormat::JsonSchema;
|
||||||
|
let req = CompletionRequest {
|
||||||
|
model: String::new(),
|
||||||
|
messages: vec![],
|
||||||
|
temperature: None,
|
||||||
|
max_tokens: None,
|
||||||
|
json_mode: true,
|
||||||
|
};
|
||||||
|
let body = provider.request_body(&req, provider.default_model());
|
||||||
|
assert_eq!(body["response_format"]["type"], "json_schema");
|
||||||
|
assert_eq!(
|
||||||
|
body["response_format"]["json_schema"]["schema"]["type"],
|
||||||
|
"object"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn text_response_format_for_lm_studio() {
|
||||||
|
let mut provider = OpenAiProvider::new(
|
||||||
|
Some("test-key".into()),
|
||||||
|
Some("http://localhost:1234/v1".into()),
|
||||||
|
Some("local-model".into()),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
provider.response_format = OpenAiResponseFormat::Text;
|
||||||
|
let req = CompletionRequest {
|
||||||
|
model: String::new(),
|
||||||
|
messages: vec![],
|
||||||
|
temperature: None,
|
||||||
|
max_tokens: None,
|
||||||
|
json_mode: true,
|
||||||
|
};
|
||||||
|
let body = provider.request_body(&req, provider.default_model());
|
||||||
|
assert_eq!(body["response_format"], json!({ "type": "text" }));
|
||||||
|
}
|
||||||
|
|
||||||
// Env var fallback tests mutate process-global state and race with parallel tests.
|
// Env var fallback tests mutate process-global state and race with parallel tests.
|
||||||
// The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
|
// The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
|
||||||
// cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
|
// cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,6 @@ use rmcp::model::{Implementation, ServerCapabilities, ServerInfo};
|
||||||
use rmcp::{ServerHandler, tool, tool_handler, tool_router};
|
use rmcp::{ServerHandler, tool, tool_handler, tool_router};
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
use tracing::{error, info, warn};
|
use tracing::{error, info, warn};
|
||||||
use url::Url;
|
|
||||||
|
|
||||||
use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult};
|
use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult};
|
||||||
|
|
||||||
|
|
@ -52,21 +51,12 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Validate that a URL is non-empty and has an http or https scheme.
|
/// Validate that a URL is public HTTP(S), matching the fetch-layer SSRF guard.
|
||||||
fn validate_url(url: &str) -> Result<(), String> {
|
async fn validate_url(url: &str) -> Result<(), String> {
|
||||||
if url.is_empty() {
|
webclaw_fetch::url_security::validate_public_http_url(url)
|
||||||
return Err("Invalid URL: must not be empty".into());
|
.await
|
||||||
}
|
.map(|_| ())
|
||||||
match Url::parse(url) {
|
.map_err(|e| format!("Invalid URL: {e}"))
|
||||||
Ok(parsed) if parsed.scheme() == "http" || parsed.scheme() == "https" => Ok(()),
|
|
||||||
Ok(parsed) => Err(format!(
|
|
||||||
"Invalid URL: scheme '{}' not allowed, must start with http:// or https://",
|
|
||||||
parsed.scheme()
|
|
||||||
)),
|
|
||||||
Err(e) => Err(format!(
|
|
||||||
"Invalid URL: {e}. Must start with http:// or https://"
|
|
||||||
)),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Timeout for local fetch calls (prevents hanging on tarpitting servers).
|
/// Timeout for local fetch calls (prevents hanging on tarpitting servers).
|
||||||
|
|
@ -172,7 +162,7 @@ impl WebclawMcp {
|
||||||
/// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected.
|
/// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected.
|
||||||
#[tool]
|
#[tool]
|
||||||
async fn scrape(&self, Parameters(params): Parameters<ScrapeParams>) -> Result<String, String> {
|
async fn scrape(&self, Parameters(params): Parameters<ScrapeParams>) -> Result<String, String> {
|
||||||
validate_url(¶ms.url)?;
|
validate_url(¶ms.url).await?;
|
||||||
let format = params.format.as_deref().unwrap_or("markdown");
|
let format = params.format.as_deref().unwrap_or("markdown");
|
||||||
let browser = parse_browser(params.browser.as_deref());
|
let browser = parse_browser(params.browser.as_deref());
|
||||||
let include = params.include_selectors.unwrap_or_default();
|
let include = params.include_selectors.unwrap_or_default();
|
||||||
|
|
@ -262,7 +252,7 @@ impl WebclawMcp {
|
||||||
/// Crawl a website starting from a seed URL, following links breadth-first up to a configurable depth and page limit.
|
/// Crawl a website starting from a seed URL, following links breadth-first up to a configurable depth and page limit.
|
||||||
#[tool]
|
#[tool]
|
||||||
async fn crawl(&self, Parameters(params): Parameters<CrawlParams>) -> Result<String, String> {
|
async fn crawl(&self, Parameters(params): Parameters<CrawlParams>) -> Result<String, String> {
|
||||||
validate_url(¶ms.url)?;
|
validate_url(¶ms.url).await?;
|
||||||
|
|
||||||
if let Some(max) = params.max_pages
|
if let Some(max) = params.max_pages
|
||||||
&& max > 500
|
&& max > 500
|
||||||
|
|
@ -311,7 +301,7 @@ impl WebclawMcp {
|
||||||
/// Discover URLs from a website's sitemaps (robots.txt + sitemap.xml).
|
/// Discover URLs from a website's sitemaps (robots.txt + sitemap.xml).
|
||||||
#[tool]
|
#[tool]
|
||||||
async fn map(&self, Parameters(params): Parameters<MapParams>) -> Result<String, String> {
|
async fn map(&self, Parameters(params): Parameters<MapParams>) -> Result<String, String> {
|
||||||
validate_url(¶ms.url)?;
|
validate_url(¶ms.url).await?;
|
||||||
let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, ¶ms.url)
|
let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, ¶ms.url)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| format!("Sitemap discovery failed: {e}"))?;
|
.map_err(|e| format!("Sitemap discovery failed: {e}"))?;
|
||||||
|
|
@ -334,7 +324,7 @@ impl WebclawMcp {
|
||||||
return Err("batch is limited to 100 URLs per request".into());
|
return Err("batch is limited to 100 URLs per request".into());
|
||||||
}
|
}
|
||||||
for u in ¶ms.urls {
|
for u in ¶ms.urls {
|
||||||
validate_url(u)?;
|
validate_url(u).await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let format = params.format.as_deref().unwrap_or("markdown");
|
let format = params.format.as_deref().unwrap_or("markdown");
|
||||||
|
|
@ -376,7 +366,7 @@ impl WebclawMcp {
|
||||||
&self,
|
&self,
|
||||||
Parameters(params): Parameters<ExtractParams>,
|
Parameters(params): Parameters<ExtractParams>,
|
||||||
) -> Result<String, String> {
|
) -> Result<String, String> {
|
||||||
validate_url(¶ms.url)?;
|
validate_url(¶ms.url).await?;
|
||||||
|
|
||||||
if params.schema.is_none() && params.prompt.is_none() {
|
if params.schema.is_none() && params.prompt.is_none() {
|
||||||
return Err("Either 'schema' or 'prompt' is required for extraction.".into());
|
return Err("Either 'schema' or 'prompt' is required for extraction.".into());
|
||||||
|
|
@ -433,7 +423,7 @@ impl WebclawMcp {
|
||||||
&self,
|
&self,
|
||||||
Parameters(params): Parameters<SummarizeParams>,
|
Parameters(params): Parameters<SummarizeParams>,
|
||||||
) -> Result<String, String> {
|
) -> Result<String, String> {
|
||||||
validate_url(¶ms.url)?;
|
validate_url(¶ms.url).await?;
|
||||||
|
|
||||||
// No local LLM — fall back to cloud API directly
|
// No local LLM — fall back to cloud API directly
|
||||||
if self.llm_chain.is_none() {
|
if self.llm_chain.is_none() {
|
||||||
|
|
@ -475,7 +465,7 @@ impl WebclawMcp {
|
||||||
/// Automatically falls back to the webclaw cloud API when bot protection is detected.
|
/// Automatically falls back to the webclaw cloud API when bot protection is detected.
|
||||||
#[tool]
|
#[tool]
|
||||||
async fn diff(&self, Parameters(params): Parameters<DiffParams>) -> Result<String, String> {
|
async fn diff(&self, Parameters(params): Parameters<DiffParams>) -> Result<String, String> {
|
||||||
validate_url(&params.url)?;
|
validate_url(&params.url).await?;
|
||||||
let previous: webclaw_core::ExtractionResult =
|
let previous: webclaw_core::ExtractionResult =
|
||||||
serde_json::from_str(&params.previous_snapshot)
|
serde_json::from_str(&params.previous_snapshot)
|
||||||
.map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
|
.map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
|
||||||
|
|
@ -543,7 +533,7 @@ impl WebclawMcp {
|
||||||
/// Automatically falls back to the webclaw cloud API when bot protection is detected.
|
/// Automatically falls back to the webclaw cloud API when bot protection is detected.
|
||||||
#[tool]
|
#[tool]
|
||||||
async fn brand(&self, Parameters(params): Parameters<BrandParams>) -> Result<String, String> {
|
async fn brand(&self, Parameters(params): Parameters<BrandParams>) -> Result<String, String> {
|
||||||
validate_url(&params.url)?;
|
validate_url(&params.url).await?;
|
||||||
let fetch_result =
|
let fetch_result =
|
||||||
tokio::time::timeout(LOCAL_FETCH_TIMEOUT, self.fetch_client.fetch(&params.url))
|
tokio::time::timeout(LOCAL_FETCH_TIMEOUT, self.fetch_client.fetch(&params.url))
|
||||||
.await
|
.await
|
||||||
|
|
@ -718,6 +708,55 @@ impl WebclawMcp {
|
||||||
Ok(serde_json::to_string_pretty(&resp).unwrap_or_default())
|
Ok(serde_json::to_string_pretty(&resp).unwrap_or_default())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// List every vertical extractor the server knows about. Returns a
|
||||||
|
/// JSON array of `{name, label, description, url_patterns}` entries.
|
||||||
|
/// Call this to discover what verticals are available before using
|
||||||
|
/// `vertical_scrape`.
|
||||||
|
#[tool]
|
||||||
|
async fn list_extractors(
|
||||||
|
&self,
|
||||||
|
Parameters(_params): Parameters<ListExtractorsParams>,
|
||||||
|
) -> Result<String, String> {
|
||||||
|
let catalog = webclaw_fetch::extractors::list();
|
||||||
|
serde_json::to_string_pretty(&catalog)
|
||||||
|
.map_err(|e| format!("failed to serialise extractor catalog: {e}"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run a vertical extractor by name and return typed JSON specific
|
||||||
|
/// to the target site (title, price, rating, author, etc.), not
|
||||||
|
/// generic markdown. Use `list_extractors` to discover available
|
||||||
|
/// names. Example names: `reddit`, `github_repo`, `trustpilot_reviews`,
|
||||||
|
/// `youtube_video`, `shopify_product`, `pypi`, `npm`, `arxiv`.
|
||||||
|
///
|
||||||
|
/// Antibot-gated verticals (amazon_product, ebay_listing,
|
||||||
|
/// etsy_listing, trustpilot_reviews) will automatically escalate to
|
||||||
|
/// the webclaw cloud API when local fetch hits bot protection,
|
||||||
|
/// provided `WEBCLAW_API_KEY` is set.
|
||||||
|
#[tool]
|
||||||
|
async fn vertical_scrape(
|
||||||
|
&self,
|
||||||
|
Parameters(params): Parameters<VerticalParams>,
|
||||||
|
) -> Result<String, String> {
|
||||||
|
validate_url(&params.url).await?;
|
||||||
|
// Use the cached Firefox client, not the default Chrome one.
|
||||||
|
// Reddit's `.json` endpoint rejects the wreq-Chrome TLS
|
||||||
|
// fingerprint with a 403 even from residential IPs (they
|
||||||
|
// ship a fingerprint blocklist that includes common
|
||||||
|
// browser-emulation libraries). The wreq-Firefox fingerprint
|
||||||
|
// still passes, and Firefox is equally fine for every other
|
||||||
|
// vertical in the catalog, so it's a strictly-safer default
|
||||||
|
// for `vertical_scrape` than the generic `scrape` tool's
|
||||||
|
// Chrome default. Matches the CLI `webclaw vertical`
|
||||||
|
// subcommand which already uses Firefox.
|
||||||
|
let client = self.firefox_or_build()?;
|
||||||
|
let data =
|
||||||
|
webclaw_fetch::extractors::dispatch_by_name(client.as_ref(), &params.name, &params.url)
|
||||||
|
.await
|
||||||
|
.map_err(|e| e.to_string())?;
|
||||||
|
serde_json::to_string_pretty(&data)
|
||||||
|
.map_err(|e| format!("failed to serialise extractor output: {e}"))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tool_handler]
|
#[tool_handler]
|
||||||
|
|
@ -727,7 +766,8 @@ impl ServerHandler for WebclawMcp {
|
||||||
.with_server_info(Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION")))
|
.with_server_info(Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION")))
|
||||||
.with_instructions(String::from(
|
.with_instructions(String::from(
|
||||||
"Webclaw MCP server -- web content extraction for AI agents. \
|
"Webclaw MCP server -- web content extraction for AI agents. \
|
||||||
Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.",
|
Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search, \
|
||||||
|
list_extractors, vertical_scrape.",
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -760,7 +800,9 @@ fn slugify(query: &str) -> String {
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
.join("-")
|
.join("-")
|
||||||
.to_lowercase();
|
.to_lowercase();
|
||||||
if s.len() > 60 { s[..60].to_string() } else { s }
|
// char-safe truncation: byte slicing panics if char 60 lands
|
||||||
|
// mid-codepoint (multibyte queries, e.g. CJK / accented input).
|
||||||
|
s.chars().take(60).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check for a cached research result. Returns the compact response if found.
|
/// Check for a cached research result. Returns the compact response if found.
|
||||||
|
|
@ -816,3 +858,32 @@ fn save_research(dir: &std::path::Path, slug: &str, data: &serde_json::Value) ->
|
||||||
json_path.to_string_lossy().to_string(),
|
json_path.to_string_lossy().to_string(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::slugify;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn slugify_multibyte_query_does_not_panic() {
|
||||||
|
// Byte-slicing s[..60] would panic mid-codepoint on multibyte
|
||||||
|
// alphanumerics; char-safe truncation must not.
|
||||||
|
let q = "日本語のクエリ".repeat(20); // long, 3-byte chars
|
||||||
|
let s = slugify(&q);
|
||||||
|
assert!(
|
||||||
|
s.chars().count() <= 60,
|
||||||
|
"slug too long: {}",
|
||||||
|
s.chars().count()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn slugify_ascii_unchanged_under_limit() {
|
||||||
|
assert_eq!(slugify("Hello World Query"), "hello-world-query");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn slugify_caps_long_ascii_at_60_chars() {
|
||||||
|
let s = slugify(&"word ".repeat(40));
|
||||||
|
assert!(s.len() <= 60);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -103,3 +103,20 @@ pub struct SearchParams {
|
||||||
/// Number of results to return (default: 10)
|
/// Number of results to return (default: 10)
|
||||||
pub num_results: Option<u32>,
|
pub num_results: Option<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Parameters for `vertical_scrape`: run a site-specific extractor by name.
|
||||||
|
#[derive(Debug, Deserialize, JsonSchema)]
|
||||||
|
pub struct VerticalParams {
|
||||||
|
/// Name of the vertical extractor. Call `list_extractors` to see all
|
||||||
|
/// available names. Examples: "reddit", "github_repo", "pypi",
|
||||||
|
/// "trustpilot_reviews", "youtube_video", "shopify_product".
|
||||||
|
pub name: String,
|
||||||
|
/// URL to extract. Must match the URL patterns the extractor claims;
|
||||||
|
/// otherwise the tool returns a clear "URL mismatch" error.
|
||||||
|
pub url: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `list_extractors` takes no arguments but we still need an empty struct
|
||||||
|
/// so rmcp can generate a schema and parse the (empty) JSON-RPC params.
|
||||||
|
#[derive(Debug, Deserialize, JsonSchema)]
|
||||||
|
pub struct ListExtractorsParams {}
|
||||||
|
|
|
||||||
|
|
@ -70,7 +70,21 @@ impl IntoResponse for ApiError {
|
||||||
|
|
||||||
impl From<webclaw_fetch::FetchError> for ApiError {
|
impl From<webclaw_fetch::FetchError> for ApiError {
|
||||||
fn from(e: webclaw_fetch::FetchError) -> Self {
|
fn from(e: webclaw_fetch::FetchError) -> Self {
|
||||||
Self::Fetch(e.to_string())
|
match e {
|
||||||
|
webclaw_fetch::FetchError::InvalidUrl(msg) => {
|
||||||
|
Self::BadRequest(format!("invalid url: {msg}"))
|
||||||
|
}
|
||||||
|
other => {
|
||||||
|
let msg = other.to_string();
|
||||||
|
if msg.contains("invalid url:")
|
||||||
|
|| msg.contains("blocked private or internal address")
|
||||||
|
{
|
||||||
|
Self::BadRequest(msg)
|
||||||
|
} else {
|
||||||
|
Self::Fetch(msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -75,6 +75,15 @@ async fn main() -> anyhow::Result<()> {
|
||||||
.compact()
|
.compact()
|
||||||
.init();
|
.init();
|
||||||
|
|
||||||
|
if is_unspecified_addr(args.host)
|
||||||
|
&& args.api_key.is_none()
|
||||||
|
&& std::env::var_os("WEBCLAW_ALLOW_OPEN_PUBLIC").is_none()
|
||||||
|
{
|
||||||
|
anyhow::bail!(
|
||||||
|
"refusing to bind 0.0.0.0/[::] without WEBCLAW_API_KEY; set WEBCLAW_API_KEY or WEBCLAW_ALLOW_OPEN_PUBLIC=1 to override"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
let state = AppState::new(args.api_key.clone())?;
|
let state = AppState::new(args.api_key.clone())?;
|
||||||
|
|
||||||
let v1 = Router::new()
|
let v1 = Router::new()
|
||||||
|
|
@ -121,3 +130,10 @@ async fn main() -> anyhow::Result<()> {
|
||||||
axum::serve(listener, app).await?;
|
axum::serve(listener, app).await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn is_unspecified_addr(addr: IpAddr) -> bool {
|
||||||
|
match addr {
|
||||||
|
IpAddr::V4(ip) => ip.is_unspecified(),
|
||||||
|
IpAddr::V6(ip) => ip.is_unspecified(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,14 @@ pub async fn batch(
|
||||||
req.urls.len()
|
req.urls.len()
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
let mut safe_urls = Vec::with_capacity(req.urls.len());
|
||||||
|
for url in &req.urls {
|
||||||
|
safe_urls.push(
|
||||||
|
webclaw_fetch::url_security::validate_public_http_url(url)
|
||||||
|
.await?
|
||||||
|
.to_string(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
let concurrency = req.concurrency.unwrap_or(5).clamp(1, HARD_MAX_CONCURRENCY);
|
let concurrency = req.concurrency.unwrap_or(5).clamp(1, HARD_MAX_CONCURRENCY);
|
||||||
|
|
||||||
|
|
@ -47,7 +55,7 @@ pub async fn batch(
|
||||||
include_raw_html: false,
|
include_raw_html: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
let url_refs: Vec<&str> = req.urls.iter().map(|s| s.as_str()).collect();
|
let url_refs: Vec<&str> = safe_urls.iter().map(|s| s.as_str()).collect();
|
||||||
let results = state
|
let results = state
|
||||||
.fetch()
|
.fetch()
|
||||||
.fetch_and_extract_batch_with_options(&url_refs, concurrency, &options)
|
.fetch_and_extract_batch_with_options(&url_refs, concurrency, &options)
|
||||||
|
|
|
||||||
|
|
@ -21,8 +21,9 @@ pub async fn brand(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
|
|
||||||
let fetched = state.fetch().fetch(&req.url).await?;
|
let fetched = state.fetch().fetch(url.as_str()).await?;
|
||||||
let brand = extract_brand(&fetched.html, Some(&fetched.url));
|
let brand = extract_brand(&fetched.html, Some(&fetched.url));
|
||||||
|
|
||||||
Ok(Json(json!({
|
Ok(Json(json!({
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,7 @@ pub async fn crawl(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
let max_pages = req.max_pages.unwrap_or(50).min(HARD_MAX_PAGES);
|
let max_pages = req.max_pages.unwrap_or(50).min(HARD_MAX_PAGES);
|
||||||
let max_depth = req.max_depth.unwrap_or(3);
|
let max_depth = req.max_depth.unwrap_or(3);
|
||||||
let concurrency = req.concurrency.unwrap_or(5).min(20);
|
let concurrency = req.concurrency.unwrap_or(5).min(20);
|
||||||
|
|
@ -56,8 +57,8 @@ pub async fn crawl(
|
||||||
cancel_flag: None,
|
cancel_flag: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let crawler = Crawler::new(&req.url, config).map_err(ApiError::from)?;
|
let crawler = Crawler::new(url.as_str(), config).map_err(ApiError::from)?;
|
||||||
let result = crawler.crawl(&req.url, None).await;
|
let result = crawler.crawl(url.as_str(), None).await;
|
||||||
|
|
||||||
let pages: Vec<Value> = result
|
let pages: Vec<Value> = result
|
||||||
.pages
|
.pages
|
||||||
|
|
|
||||||
|
|
@ -75,8 +75,9 @@ pub async fn diff_route(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
|
|
||||||
let current = state.fetch().fetch_and_extract(&req.url).await?;
|
let current = state.fetch().fetch_and_extract(url.as_str()).await?;
|
||||||
let previous = req.previous.into_extraction();
|
let previous = req.previous.into_extraction();
|
||||||
let result = diff(&previous, &current);
|
let result = diff(&previous, &current);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -43,10 +43,11 @@ pub async fn extract(
|
||||||
"either `schema` or `prompt` is required",
|
"either `schema` or `prompt` is required",
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
|
|
||||||
// Fetch + extract first so we feed the LLM clean markdown instead of
|
// Fetch + extract first so we feed the LLM clean markdown instead of
|
||||||
// raw HTML. Cheaper tokens, better signal.
|
// raw HTML. Cheaper tokens, better signal.
|
||||||
let extraction = state.fetch().fetch_and_extract(&req.url).await?;
|
let extraction = state.fetch().fetch_and_extract(url.as_str()).await?;
|
||||||
let content = if extraction.content.markdown.trim().is_empty() {
|
let content = if extraction.content.markdown.trim().is_empty() {
|
||||||
extraction.content.plain_text.clone()
|
extraction.content.plain_text.clone()
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
|
|
@ -27,8 +27,9 @@ pub async fn map(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
|
|
||||||
let entries = sitemap::discover(state.fetch(), &req.url).await?;
|
let entries = sitemap::discover(state.fetch(), url.as_str()).await?;
|
||||||
|
|
||||||
let body = if req.include_metadata {
|
let body = if req.include_metadata {
|
||||||
json!({
|
json!({
|
||||||
|
|
|
||||||
|
|
@ -52,6 +52,7 @@ pub async fn scrape(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
let formats = req.formats.as_vec();
|
let formats = req.formats.as_vec();
|
||||||
|
|
||||||
let options = ExtractionOptions {
|
let options = ExtractionOptions {
|
||||||
|
|
@ -63,11 +64,11 @@ pub async fn scrape(
|
||||||
|
|
||||||
let extraction = state
|
let extraction = state
|
||||||
.fetch()
|
.fetch()
|
||||||
.fetch_and_extract_with_options(&req.url, &options)
|
.fetch_and_extract_with_options(url.as_str(), &options)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
let mut body = json!({
|
let mut body = json!({
|
||||||
"url": extraction.metadata.url.clone().unwrap_or_else(|| req.url.clone()),
|
"url": extraction.metadata.url.clone().unwrap_or_else(|| url.to_string()),
|
||||||
"metadata": extraction.metadata,
|
"metadata": extraction.metadata,
|
||||||
});
|
});
|
||||||
let obj = body.as_object_mut().expect("json::object");
|
let obj = body.as_object_mut().expect("json::object");
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ impl From<ExtractorDispatchError> for ApiError {
|
||||||
match e {
|
match e {
|
||||||
ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
|
ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
|
||||||
ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),
|
ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),
|
||||||
ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()),
|
ExtractorDispatchError::Fetch(f) => ApiError::from(f),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -46,7 +46,8 @@ pub async fn scrape_vertical(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
let data = extractors::dispatch_by_name(state.fetch(), &vertical, &req.url).await?;
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
|
let data = extractors::dispatch_by_name(state.fetch(), &vertical, url.as_str()).await?;
|
||||||
Ok(Json(json!({
|
Ok(Json(json!({
|
||||||
"vertical": vertical,
|
"vertical": vertical,
|
||||||
"url": req.url,
|
"url": req.url,
|
||||||
|
|
|
||||||
|
|
@ -22,8 +22,9 @@ pub async fn summarize_route(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
|
|
||||||
let extraction = state.fetch().fetch_and_extract(&req.url).await?;
|
let extraction = state.fetch().fetch_and_extract(url.as_str()).await?;
|
||||||
let content = if extraction.content.markdown.trim().is_empty() {
|
let content = if extraction.content.markdown.trim().is_empty() {
|
||||||
extraction.content.plain_text.clone()
|
extraction.content.plain_text.clone()
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,19 @@ warn() { printf "${YELLOW}[!]${RESET} %s\n" "$*"; }
|
||||||
error() { printf "${RED}[x]${RESET} %s\n" "$*" >&2; }
|
error() { printf "${RED}[x]${RESET} %s\n" "$*" >&2; }
|
||||||
fatal() { error "$*"; exit 1; }
|
fatal() { error "$*"; exit 1; }
|
||||||
|
|
||||||
|
# Mask a secret for display: keep the last 4 chars, redact the rest.
|
||||||
|
# Empty input renders as "(not set)".
|
||||||
|
mask_secret() {
|
||||||
|
local s="$1"
|
||||||
|
if [[ -z "$s" ]]; then
|
||||||
|
printf '(not set)'
|
||||||
|
elif (( ${#s} <= 4 )); then
|
||||||
|
printf '****'
|
||||||
|
else
|
||||||
|
printf '****%s' "${s: -4}"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
prompt() {
|
prompt() {
|
||||||
local var_name="$1" prompt_text="$2" default="${3:-}"
|
local var_name="$1" prompt_text="$2" default="${3:-}"
|
||||||
if [[ -n "$default" ]]; then
|
if [[ -n "$default" ]]; then
|
||||||
|
|
@ -52,7 +65,7 @@ prompt() {
|
||||||
printf "${CYAN} %s${RESET}: " "$prompt_text"
|
printf "${CYAN} %s${RESET}: " "$prompt_text"
|
||||||
fi
|
fi
|
||||||
read -r input
|
read -r input
|
||||||
eval "$var_name=\"${input:-$default}\""
|
printf -v "$var_name" '%s' "${input:-$default}"
|
||||||
}
|
}
|
||||||
|
|
||||||
prompt_secret() {
|
prompt_secret() {
|
||||||
|
|
@ -64,7 +77,7 @@ prompt_secret() {
|
||||||
fi
|
fi
|
||||||
read -rs input
|
read -rs input
|
||||||
echo
|
echo
|
||||||
eval "$var_name=\"${input:-$default}\""
|
printf -v "$var_name" '%s' "${input:-$default}"
|
||||||
}
|
}
|
||||||
|
|
||||||
generate_key() {
|
generate_key() {
|
||||||
|
|
@ -374,7 +387,7 @@ create_server() {
|
||||||
printf " Domain: ${BOLD}%s${RESET}\n" "${DOMAIN:-none}"
|
printf " Domain: ${BOLD}%s${RESET}\n" "${DOMAIN:-none}"
|
||||||
printf " OpenAI key: ${BOLD}%s${RESET}\n" "$([ -n "$OPENAI_KEY" ] && echo 'set' || echo 'not set')"
|
printf " OpenAI key: ${BOLD}%s${RESET}\n" "$([ -n "$OPENAI_KEY" ] && echo 'set' || echo 'not set')"
|
||||||
printf " Anthropic key:${BOLD}%s${RESET}\n" "$([ -n "$ANTHROPIC_KEY" ] && echo 'set' || echo 'not set')"
|
printf " Anthropic key:${BOLD}%s${RESET}\n" "$([ -n "$ANTHROPIC_KEY" ] && echo 'set' || echo 'not set')"
|
||||||
printf " Auth key: ${BOLD}%s${RESET}\n" "$AUTH_KEY"
|
printf " Auth key: ${BOLD}%s${RESET}\n" "$(mask_secret "$AUTH_KEY")"
|
||||||
printf " Ollama model: ${BOLD}%s${RESET}\n" "$OLLAMA_MODEL"
|
printf " Ollama model: ${BOLD}%s${RESET}\n" "$OLLAMA_MODEL"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
|
|
@ -454,7 +467,9 @@ create_server() {
|
||||||
echo
|
echo
|
||||||
printf " ${BOLD}Server IP:${RESET} %s\n" "$server_ip"
|
printf " ${BOLD}Server IP:${RESET} %s\n" "$server_ip"
|
||||||
printf " ${BOLD}SSH:${RESET} ssh root@%s\n" "$server_ip"
|
printf " ${BOLD}SSH:${RESET} ssh root@%s\n" "$server_ip"
|
||||||
printf " ${BOLD}Auth key:${RESET} %s\n" "$AUTH_KEY"
|
printf " ${BOLD}Auth key:${RESET} %s\n" "$(mask_secret "$AUTH_KEY")"
|
||||||
|
printf " ${DIM}(full key stored in /opt/webclaw/.env on the server:\n"
|
||||||
|
printf " ssh root@%s 'grep WEBCLAW_AUTH_KEY /opt/webclaw/.env')${RESET}\n" "$server_ip"
|
||||||
echo
|
echo
|
||||||
printf " ${BOLD}Monitor build progress:${RESET}\n"
|
printf " ${BOLD}Monitor build progress:${RESET}\n"
|
||||||
printf " ssh root@%s 'cd /opt/webclaw && docker compose logs -f'\n" "$server_ip"
|
printf " ssh root@%s 'cd /opt/webclaw && docker compose logs -f'\n" "$server_ip"
|
||||||
|
|
@ -465,7 +480,7 @@ create_server() {
|
||||||
printf " ${BOLD}Scrape:${RESET}\n"
|
printf " ${BOLD}Scrape:${RESET}\n"
|
||||||
printf " curl -X POST http://%s:3000/v1/scrape \\\\\n" "$server_ip"
|
printf " curl -X POST http://%s:3000/v1/scrape \\\\\n" "$server_ip"
|
||||||
printf " -H 'Content-Type: application/json' \\\\\n"
|
printf " -H 'Content-Type: application/json' \\\\\n"
|
||||||
printf " -H 'Authorization: Bearer %s' \\\\\n" "$AUTH_KEY"
|
printf " -H 'Authorization: Bearer <YOUR_AUTH_KEY>' \\\\\n"
|
||||||
printf " -d '{\"url\": \"https://example.com\"}'\n"
|
printf " -d '{\"url\": \"https://example.com\"}'\n"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
|
|
@ -482,7 +497,8 @@ create_server() {
|
||||||
echo
|
echo
|
||||||
|
|
||||||
printf " ${BOLD}Tear down:${RESET}\n"
|
printf " ${BOLD}Tear down:${RESET}\n"
|
||||||
printf " HETZNER_TOKEN=%s ./deploy/hetzner.sh --destroy\n" "$HETZNER_TOKEN"
|
printf " HETZNER_TOKEN=\$HETZNER_TOKEN ./deploy/hetzner.sh --destroy\n"
|
||||||
|
printf " ${DIM}(re-export the same token you used to deploy)${RESET}\n"
|
||||||
echo
|
echo
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,14 @@
|
||||||
|
|
||||||
Practical examples showing what webclaw can do. Each example is a self-contained command you can run immediately.
|
Practical examples showing what webclaw can do. Each example is a self-contained command you can run immediately.
|
||||||
|
|
||||||
|
## Workflow Guides
|
||||||
|
|
||||||
|
- [HTML to Markdown for RAG](./html-to-markdown-rag/) turns web pages into markdown or compact LLM text for retrieval pipelines.
|
||||||
|
- [Firecrawl-Compatible API](./firecrawl-compatible-api/) shows the `/v2` compatibility routes for scrape, crawl, map, and search.
|
||||||
|
- [MCP Web Scraping](./mcp-web-scraping/) connects webclaw to MCP clients such as Claude Code, Claude Desktop, Cursor, and Codex CLI.
|
||||||
|
- [Proxy-Backed Crawling](./proxy-backed-crawling/) shows single-proxy and proxy-pool crawling from the CLI.
|
||||||
|
- [Cloudflare Diagnostics](./cloudflare-diagnostics/) gives a reproducible checklist for blocked or empty protected-site results.
|
||||||
|
|
||||||
## Basic Extraction
|
## Basic Extraction
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|
|
||||||
58
examples/cloudflare-diagnostics/README.md
Normal file
58
examples/cloudflare-diagnostics/README.md
Normal file
|
|
@ -0,0 +1,58 @@
|
||||||
|
# Cloudflare Diagnostics
|
||||||
|
|
||||||
|
Use this checklist when a page works in the browser but fails from a scraper, returns a challenge page, or produces empty extracted content.
|
||||||
|
|
||||||
|
## 1. Save the Raw Response
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://protected.example.com --raw-html > raw.html
|
||||||
|
```
|
||||||
|
|
||||||
|
Inspect `raw.html` for challenge copy, blocked request text, empty shells, or application HTML that needs JavaScript rendering.
|
||||||
|
|
||||||
|
## 2. Compare Extracted Formats
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://protected.example.com --format markdown > page.md
|
||||||
|
webclaw https://protected.example.com --format json > page.json
|
||||||
|
webclaw https://protected.example.com --format llm > page.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
If raw HTML has content but markdown is empty, tune extraction with selectors:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://protected.example.com \
|
||||||
|
--include "main, article, [role=main]" \
|
||||||
|
--exclude "nav, footer, aside, .cookie-banner" \
|
||||||
|
--format markdown
|
||||||
|
```
|
||||||
|
|
||||||
|
## 3. Try Another Browser Fingerprint
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://protected.example.com --browser firefox --format markdown
|
||||||
|
webclaw https://protected.example.com --browser random --format markdown
|
||||||
|
```
|
||||||
|
|
||||||
|
## 4. Use Cloud Fallback
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export WEBCLAW_API_KEY=wc_your_key
|
||||||
|
|
||||||
|
webclaw https://protected.example.com --cloud --format markdown
|
||||||
|
```
|
||||||
|
|
||||||
|
Cloud mode can use hosted routing, JS rendering, and protected-site handling that are not part of the fully local open-source path.
|
||||||
|
|
||||||
|
## 5. Keep a Reproducible Report
|
||||||
|
|
||||||
|
When reporting a problem, include:
|
||||||
|
|
||||||
|
- target URL
|
||||||
|
- command used
|
||||||
|
- selected format
|
||||||
|
- whether `--raw-html` returned a challenge or normal page HTML
|
||||||
|
- whether `--browser firefox` changed the result
|
||||||
|
- whether cloud mode changed the result
|
||||||
|
|
||||||
|
Remove cookies, tokens, customer data, and private URLs before sharing logs.
|
||||||
60
examples/firecrawl-compatible-api/README.md
Normal file
60
examples/firecrawl-compatible-api/README.md
Normal file
|
|
@ -0,0 +1,60 @@
|
||||||
|
# Firecrawl-Compatible API
|
||||||
|
|
||||||
|
webclaw exposes Firecrawl-compatible v2 routes for teams migrating existing scrape, crawl, map, or search calls.
|
||||||
|
|
||||||
|
## Scrape
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl https://api.webclaw.io/v2/scrape \
|
||||||
|
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"url": "https://example.com",
|
||||||
|
"formats": ["markdown"]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Crawl
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl https://api.webclaw.io/v2/crawl \
|
||||||
|
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"url": "https://docs.example.com",
|
||||||
|
"limit": 25,
|
||||||
|
"maxDepth": 2
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Poll the returned crawl id:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl https://api.webclaw.io/v2/crawl/$CRAWL_ID \
|
||||||
|
-H "Authorization: Bearer $WEBCLAW_API_KEY"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Map
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl https://api.webclaw.io/v2/map \
|
||||||
|
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"url": "https://docs.example.com"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Search
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl https://api.webclaw.io/v2/search \
|
||||||
|
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"query": "site:docs.rs tokio tutorial",
|
||||||
|
"limit": 5
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Compatibility routes are meant to reduce migration friction. For new projects, prefer the native `/v1` API because it exposes webclaw-specific options more directly.
|
||||||
50
examples/html-to-markdown-rag/README.md
Normal file
50
examples/html-to-markdown-rag/README.md
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
# HTML to Markdown for RAG
|
||||||
|
|
||||||
|
Turn web pages into clean markdown or compact LLM text before chunking, embedding, or passing the page to an agent.
|
||||||
|
|
||||||
|
## CLI
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clean markdown with headings, links, and readable structure.
|
||||||
|
webclaw https://docs.anthropic.com --format markdown > page.md
|
||||||
|
|
||||||
|
# Token-optimized output for direct LLM context.
|
||||||
|
webclaw https://docs.anthropic.com --format llm > page.txt
|
||||||
|
|
||||||
|
# Keep the main article content and remove common navigation/footer noise.
|
||||||
|
webclaw https://docs.anthropic.com \
|
||||||
|
--only-main-content \
|
||||||
|
--format markdown \
|
||||||
|
> page.md
|
||||||
|
```
|
||||||
|
|
||||||
|
## Batch a URL List
|
||||||
|
|
||||||
|
Create `urls.txt`:
|
||||||
|
|
||||||
|
```text
|
||||||
|
https://docs.anthropic.com/
|
||||||
|
https://docs.anthropic.com/en/docs/claude-code
|
||||||
|
https://docs.anthropic.com/en/api/messages
|
||||||
|
```
|
||||||
|
|
||||||
|
Run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw --urls-file urls.txt --format llm > corpus.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Hosted API
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl https://api.webclaw.io/v1/scrape \
|
||||||
|
-H "Authorization: Bearer $WEBCLAW_API_KEY" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"url": "https://docs.anthropic.com",
|
||||||
|
"formats": ["markdown", "llm"],
|
||||||
|
"only_main_content": true
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Use `markdown` when humans may inspect the output. Use `llm` when the next step is chunking, embedding, summarization, or prompt context.
|
||||||
44
examples/mcp-web-scraping/README.md
Normal file
44
examples/mcp-web-scraping/README.md
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
# MCP Web Scraping
|
||||||
|
|
||||||
|
Use webclaw as a local MCP server so Claude Code, Claude Desktop, Cursor, Windsurf, OpenCode, Codex CLI, or another MCP client can fetch clean web context.
|
||||||
|
|
||||||
|
## Install
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npx create-webclaw
|
||||||
|
```
|
||||||
|
|
||||||
|
The installer detects supported MCP clients and can write the config for you.
|
||||||
|
|
||||||
|
## Manual Config
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"mcpServers": {
|
||||||
|
"webclaw": {
|
||||||
|
"command": "~/.webclaw/webclaw-mcp",
|
||||||
|
"env": {
|
||||||
|
"WEBCLAW_API_KEY": "wc_your_key"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`WEBCLAW_API_KEY` is optional for local extraction. Add it when you want cloud fallback for protected sites, JS rendering, hosted search, or hosted research.
|
||||||
|
|
||||||
|
## Example Prompts
|
||||||
|
|
||||||
|
```text
|
||||||
|
Scrape https://docs.rs/tokio and summarize the parts about task spawning.
|
||||||
|
```
|
||||||
|
|
||||||
|
```text
|
||||||
|
Crawl https://docs.example.com up to depth 2 and return the pages most relevant to authentication.
|
||||||
|
```
|
||||||
|
|
||||||
|
```text
|
||||||
|
Extract the pricing tiers from https://example.com/pricing as JSON with fields name, price, limits, and features.
|
||||||
|
```
|
||||||
|
|
||||||
|
The MCP server exposes tools for scrape, crawl, map, batch, extract, summarize, diff, brand, research, search, and vertical extractors.
|
||||||
53
examples/proxy-backed-crawling/README.md
Normal file
53
examples/proxy-backed-crawling/README.md
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
# Proxy-Backed Crawling
|
||||||
|
|
||||||
|
Use proxy rotation when you need to distribute a crawl across a proxy pool. webclaw accepts either a single proxy (`--proxy`) or a file listing one proxy per line (`--proxy-file`).
|
||||||
|
|
||||||
|
## Single Proxy
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://example.com \
|
||||||
|
--proxy http://user:pass@proxy.example.com:8080 \
|
||||||
|
--format markdown
|
||||||
|
```
|
||||||
|
|
||||||
|
SOCKS5 is supported too:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://example.com \
|
||||||
|
--proxy socks5://proxy.example.com:1080 \
|
||||||
|
--format markdown
|
||||||
|
```
|
||||||
|
|
||||||
|
## Proxy Pool
|
||||||
|
|
||||||
|
Create `proxies.txt` with one proxy per line:
|
||||||
|
|
||||||
|
```text
|
||||||
|
http://user:pass@proxy-1.example.com:8080
|
||||||
|
http://user:pass@proxy-2.example.com:8080
|
||||||
|
http://user:pass@proxy-3.example.com:8080
|
||||||
|
```
|
||||||
|
|
||||||
|
Run a crawl with controlled concurrency:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw https://docs.example.com \
|
||||||
|
--crawl \
|
||||||
|
--depth 2 \
|
||||||
|
--max-pages 100 \
|
||||||
|
--concurrency 10 \
|
||||||
|
--delay 200 \
|
||||||
|
--proxy-file proxies.txt \
|
||||||
|
--format markdown
|
||||||
|
```
|
||||||
|
|
||||||
|
## Batch URLs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
webclaw --urls-file urls.txt \
|
||||||
|
--proxy-file proxies.txt \
|
||||||
|
--concurrency 10 \
|
||||||
|
--format json
|
||||||
|
```
|
||||||
|
|
||||||
|
Proxy rotation helps with throughput and IP reputation. It does not replace request fingerprinting, JS rendering, or challenge handling for heavily protected sites. For those, use hosted cloud mode with `WEBCLAW_API_KEY`.
|
||||||
4
setup.sh
4
setup.sh
|
|
@ -36,7 +36,7 @@ prompt() {
|
||||||
printf "${CYAN} %s${RESET}: " "$prompt_text"
|
printf "${CYAN} %s${RESET}: " "$prompt_text"
|
||||||
fi
|
fi
|
||||||
read -r input
|
read -r input
|
||||||
eval "$var_name=\"${input:-$default}\""
|
printf -v "$var_name" '%s' "${input:-$default}"
|
||||||
}
|
}
|
||||||
|
|
||||||
prompt_secret() {
|
prompt_secret() {
|
||||||
|
|
@ -48,7 +48,7 @@ prompt_secret() {
|
||||||
fi
|
fi
|
||||||
read -rs input
|
read -rs input
|
||||||
echo
|
echo
|
||||||
eval "$var_name=\"${input:-$default}\""
|
printf -v "$var_name" '%s' "${input:-$default}"
|
||||||
}
|
}
|
||||||
|
|
||||||
prompt_yn() {
|
prompt_yn() {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue